[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] DAGMan segfaults



I'm trying to get DAGMan working, and have just a very simple test; the
relevant files are:

test.dag
========
job a test.sub
job b test.sub
parent a child b

test.sub
========
universe=vanilla
executable=test.sh
output=test.out.$(CLUSTER)
error=test.err.$(CLUSTER)
arguments=$(CLUSTER) 
log=test.log
notification=never

should_transfer_files=yes
when_to_transfer_output=on_exit

queue

test.sh
=======
#!/bin/bash
echo hello $1

When I condor_submit_dag test.dag, condor_q shows condor_dagman running
for about 10-15 seconds. Three things then happen at the same time:

1) condor_dagman leaves the queue, because...
2) the dagman log shows "Abnormal termination (signal 11)"
3) test.sh (job a) enters the queue

Job a then runs fine, but obviously because condor_dagman died nothing
else happens after that. I'm using

$CondorVersion: 7.0.4 Jul 16 2008 BuildID: 95033 $
$CondorPlatform: I386-LINUX_RHEL5 $

and I'm running Ubuntu Intrepid and Hardy; I get the same behaviour on
both, and I've never had a complaint from any other condor binary. I ran
strace against the condor_dagman process after it started, and have
attached the resulting output.

Adam
Process 11146 attached - interrupt to quit
select(0, NULL, NULL, NULL, {2, 112000}) = 0 (Timeout)
open("/proc/uptime", O_RDONLY)          = 9
fcntl(9, F_GETFL)                       = 0x8000 (flags O_RDONLY|O_LARGEFILE)
fstat(9, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc353f7f000
lseek(9, 0, SEEK_CUR)                   = 0
read(9, "176061.69 170829.17\n", 1024)  = 20
close(9)                                = 0
munmap(0x7fc353f7f000, 4096)            = 0
write(8, "CONFIRM = 17606169\nCONTROL_TIME "..., 36) = 36
close(8)                                = 0
munmap(0x7fc353f80000, 16384)           = 0
rt_sigprocmask(SIG_BLOCK, ~[ILL TRAP ABRT BUS FPE SEGV RTMIN RT_1], ~[ILL TRAP ABRT BUS FPE KILL SEGV STOP RTMIN RT_1], 8) = 0
umask(022)                              = 022
gettimeofday({1218728285, 581157}, NULL) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=3661, ...}) = 0
open("/home/alt36/test/dag/test.dag.dagman.out", O_WRONLY|O_CREAT|O_EXCL|O_APPEND, 0644) = -1 EEXIST (File exists)
open("/home/alt36/test/dag/test.dag.dagman.out", O_WRONLY|O_APPEND) = 8
fcntl(8, F_GETFL)                       = 0x8401 (flags O_WRONLY|O_APPEND|O_LARGEFILE)
fstat(8, {st_mode=S_IFREG|0700, st_size=18195, ...}) = 0
mmap(NULL, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc353f80000
lseek(8, 0, SEEK_CUR)                   = 0
lseek(8, 0, SEEK_END)                   = 18195
write(8, "8/14 16:38:05 Bootstrapping...\n", 31) = 31
close(8)                                = 0
munmap(0x7fc353f80000, 16384)           = 0
umask(022)                              = 022
rt_sigprocmask(SIG_SETMASK, ~[ILL TRAP ABRT BUS FPE KILL SEGV STOP RTMIN RT_1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, ~[ILL TRAP ABRT BUS FPE SEGV RTMIN RT_1], ~[ILL TRAP ABRT BUS FPE KILL SEGV STOP RTMIN RT_1], 8) = 0
umask(022)                              = 022
gettimeofday({1218728285, 584570}, NULL) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=3661, ...}) = 0
open("/home/alt36/test/dag/test.dag.dagman.out", O_WRONLY|O_CREAT|O_EXCL|O_APPEND, 0644) = -1 EEXIST (File exists)
open("/home/alt36/test/dag/test.dag.dagman.out", O_WRONLY|O_APPEND) = 8
fcntl(8, F_GETFL)                       = 0x8401 (flags O_WRONLY|O_APPEND|O_LARGEFILE)
fstat(8, {st_mode=S_IFREG|0700, st_size=18226, ...}) = 0
mmap(NULL, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc353f80000
lseek(8, 0, SEEK_CUR)                   = 0
lseek(8, 0, SEEK_END)                   = 18226
write(8, "8/14 16:38:05 Number of pre-comp"..., 47) = 47
close(8)                                = 0
munmap(0x7fc353f80000, 16384)           = 0
umask(022)                              = 022
rt_sigprocmask(SIG_SETMASK, ~[ILL TRAP ABRT BUS FPE KILL SEGV STOP RTMIN RT_1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, ~[ILL TRAP ABRT BUS FPE SEGV RTMIN RT_1], ~[ILL TRAP ABRT BUS FPE KILL SEGV STOP RTMIN RT_1], 8) = 0
umask(022)                              = 022
gettimeofday({1218728285, 587079}, NULL) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=3661, ...}) = 0
open("/home/alt36/test/dag/test.dag.dagman.out", O_WRONLY|O_CREAT|O_EXCL|O_APPEND, 0644) = -1 EEXIST (File exists)
open("/home/alt36/test/dag/test.dag.dagman.out", O_WRONLY|O_APPEND) = 8
fcntl(8, F_GETFL)                       = 0x8401 (flags O_WRONLY|O_APPEND|O_LARGEFILE)
fstat(8, {st_mode=S_IFREG|0700, st_size=18273, ...}) = 0
mmap(NULL, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc353f80000
lseek(8, 0, SEEK_CUR)                   = 0
lseek(8, 0, SEEK_END)                   = 18273
write(8, "8/14 16:38:05 Running in RECOVER"..., 42) = 42
close(8)                                = 0
munmap(0x7fc353f80000, 16384)           = 0
umask(022)                              = 022
rt_sigprocmask(SIG_SETMASK, ~[ILL TRAP ABRT BUS FPE KILL SEGV STOP RTMIN RT_1], NULL, 8) = 0
stat("/home/alt36/test/dag/test.log", {st_mode=S_IFREG|0700, st_size=629, ...}) = 0
lstat("/home/alt36/test/dag/test.log", {st_mode=S_IFREG|0700, st_size=629, ...}) = 0
open("/home/alt36/test/dag/test.log", O_RDWR) = 8
fcntl(8, F_GETFL)                       = 0x8002 (flags O_RDWR|O_LARGEFILE)
fstat(8, {st_mode=S_IFREG|0700, st_size=629, ...}) = 0
mmap(NULL, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc353f80000
lseek(8, 0, SEEK_CUR)                   = 0
--- SIGSEGV (Segmentation fault) @ 0 (0) ---
Process 11146 detached