Mailing List Archives
Authenticated access
|
|
|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [HTCondor-users] transfer_in/output_files only if they exist
- Date: Thu, 28 Feb 2019 16:38:58 +0000
- From: Duncan Brown <dabrown@xxxxxxx>
- Subject: Re: [HTCondor-users] transfer_in/output_files only if they exist
Hi Todd,
More information: I did straces which suggests that condor_pid_ns_init and condor_exec.exe are both exiting with status 255, as they are supposed to. The problem also happens when I try and use a signal for checkpointing, rather than an exit code:
+CheckpointExitBySignal = True
+CheckpointExitSignal = 10
6070287.0 dbrown 2/28 10:42 Error from slot1@CRUSH-SUGWG-OSG-10-5-228-85: Job did not exit as promised when sent its checkpoint signal. Promised exit was on signal 10, actual exit status was on signal 10.
Traces below.
Cheers,
Duncan.
Here's the trace with exit code 255:
[dbrown@CRUSH-SUGWG-OSG-10-5-228-85 ~]$ strace -p 20262
strace: Process 20262 attached
wait4(-1, 0x7ffff6f413c0, 0, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGUSR1 {si_signo=SIGUSR1, si_code=SI_USER, si_pid=1, si_uid=620} ---
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [USR1], [], 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [USR1], 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [USR1], 8) = 0
open("wrapper.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3
fcntl(1, F_GETFD) = 0
fcntl(1, F_DUPFD, 10) = 10
fcntl(1, F_GETFD) = 0
fcntl(10, F_SETFD, FD_CLOEXEC) = 0
dup2(3, 1) = 1
close(3) = 0
fcntl(2, F_DUPFD, 10) = 11
fcntl(2, F_GETFD) = 0
fcntl(11, F_SETFD, FD_CLOEXEC) = 0
dup2(1, 2) = 2
write(1, "checkpoint_trap function called,"..., 58) = 58
dup2(11, 2) = 2
fcntl(11, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(11) = 0
dup2(10, 1) = 1
fcntl(10, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(10) = 0
rt_sigprocmask(SIG_BLOCK, [INT CHLD], [USR1], 8) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2ab87d19e990) = 57
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [USR1], 8) = 0
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [USR1], 8) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, 8) = 0
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 57
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=57, si_uid=620, si_status=0, si_utime=0, si_stime=0} ---
wait4(-1, 0x7ffff6f40910, WNOHANG, NULL) = 0
rt_sigreturn({mask=[USR1]}) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [], 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [], 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [], 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [], 8) = 0
open("wrapper.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3
fcntl(1, F_GETFD) = 0
fcntl(1, F_DUPFD, 10) = 10
fcntl(1, F_GETFD) = 0
fcntl(10, F_SETFD, FD_CLOEXEC) = 0
dup2(3, 1) = 1
close(3) = 0
fcntl(2, F_DUPFD, 10) = 11
fcntl(2, F_GETFD) = 0
fcntl(11, F_SETFD, FD_CLOEXEC) = 0
dup2(1, 2) = 2
write(1, "cleanup: killing any strace proc"..., 36) = 36
dup2(11, 2) = 2
fcntl(11, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(11) = 0
dup2(10, 1) = 1
fcntl(10, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(10) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [], 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
stat(".", {st_mode=S_IFDIR|0700, st_size=4096, ...}) = 0
stat("/usr/local/bin/rm", 0x7ffff6f41510) = -1 ENOENT (No such file or directory)
stat("/usr/bin/rm", {st_mode=S_IFREG|0755, st_size=62864, ...}) = 0
stat("/usr/bin/rm", {st_mode=S_IFREG|0755, st_size=62864, ...}) = 0
geteuid() = 620
getegid() = 1010
getuid() = 620
getgid() = 1010
access("/usr/bin/rm", X_OK) = 0
stat("/usr/bin/rm", {st_mode=S_IFREG|0755, st_size=62864, ...}) = 0
geteuid() = 620
getegid() = 1010
getuid() = 620
getgid() = 1010
access("/usr/bin/rm", R_OK) = 0
stat("/usr/bin/rm", {st_mode=S_IFREG|0755, st_size=62864, ...}) = 0
stat("/usr/bin/rm", {st_mode=S_IFREG|0755, st_size=62864, ...}) = 0
geteuid() = 620
getegid() = 1010
getuid() = 620
getgid() = 1010
access("/usr/bin/rm", X_OK) = 0
stat("/usr/bin/rm", {st_mode=S_IFREG|0755, st_size=62864, ...}) = 0
geteuid() = 620
getegid() = 1010
getuid() = 620
getgid() = 1010
access("/usr/bin/rm", R_OK) = 0
rt_sigprocmask(SIG_BLOCK, [INT CHLD], [], 8) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2ab87d19e990) = 58
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, 8) = 0
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 58
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=58, si_uid=620, si_status=0, si_utime=0, si_stime=0} ---
wait4(-1, 0x7ffff6f41050, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, {0x43e780, [], SA_RESTORER, 0x2ab87d7dd270}, 8) = 0
exit_group(255) = ?
+++ exited with 255 +++
[dbrown@CRUSH-SUGWG-OSG-10-5-228-85 ~]$ strace -p 20482
strace: Process 20482 attached
wait4(-1, 0x7ffeefba7fac, 0, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGUSR1 {si_signo=SIGUSR1, si_code=SI_USER, si_pid=0, si_uid=0} ---
kill(3, SIGUSR1) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 255}], 0, NULL) = 3
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=3, si_uid=620, si_status=255, si_utime=0, si_stime=0} ---
stat("/var/lib/condor/execute/dir_20478/.condor_pid_ns_status", 0x7ffeefba7fb0) = -1 ENOENT (No such file or directory)
open("/var/lib/condor/execute/dir_20478/.condor_pid_ns_status", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 3
fstat(3, {st_mode=S_IFREG|0640, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b10e36e3000
write(3, "Exited: 65280\n", 14) = 14
close(3) = 0
munmap(0x2b10e36e3000, 4096) = 0
exit_group(255) = ?
+++ exited with 255 +++
Here's the trace with SIGUSR1:
wait4(-1, 0x7ffcef6aa890, 0, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGUSR1 {si_signo=SIGUSR1, si_code=SI_USER, si_pid=1, si_uid=620} ---
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [USR1], [], 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [USR1], 8) = 0
rt_sigprocmask(SIG_BLOCK, NULL, [USR1], 8) = 0
open("wrapper.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3
fcntl(1, F_GETFD) = 0
fcntl(1, F_DUPFD, 10) = 10
fcntl(1, F_GETFD) = 0
fcntl(10, F_SETFD, FD_CLOEXEC) = 0
dup2(3, 1) = 1
close(3) = 0
fcntl(2, F_DUPFD, 10) = 11
fcntl(2, F_GETFD) = 0
fcntl(11, F_SETFD, FD_CLOEXEC) = 0
dup2(1, 2) = 2
write(1, "checkpoint_trap function called,"..., 58) = 58
dup2(11, 2) = 2
fcntl(11, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(11) = 0
dup2(10, 1) = 1
fcntl(10, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(10) = 0
rt_sigprocmask(SIG_BLOCK, [INT CHLD], [USR1], 8) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2ad7c09a5990) = 57
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [USR1], 8) = 0
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [USR1], 8) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, 8) = 0
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 57
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=57, si_uid=620, si_status=0, si_utime=0, si_stime=0} ---
wait4(-1, 0x7ffcef6a9b90, WNOHANG, NULL) = 0
rt_sigreturn({mask=[USR1]}) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, 8) = 0
open("wrapper.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3
fcntl(1, F_GETFD) = 0
fcntl(1, F_DUPFD, 10) = 10
fcntl(1, F_GETFD) = 0
fcntl(10, F_SETFD, FD_CLOEXEC) = 0
dup2(3, 1) = 1
close(3) = 0
fcntl(2, F_DUPFD, 10) = 11
fcntl(2, F_GETFD) = 0
fcntl(11, F_SETFD, FD_CLOEXEC) = 0
dup2(1, 2) = 2
write(1, "unsetting USR1 trap and re-sendi"..., 49) = 49
dup2(11, 2) = 2
fcntl(11, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(11) = 0
dup2(10, 1) = 1
fcntl(10, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(10) = 0
rt_sigaction(SIGUSR1, {SIG_DFL, [], SA_RESTORER, 0x2ad7c0fe4270}, {0x4543b0, [], SA_RESTORER, 0x2ad7c0fe4270}, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT CHLD], [USR1], 8) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2ad7c09a5990) = 58
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [USR1], 8) = 0
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [USR1], 8) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, 8) = 0
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 58
rt_sigprocmask(SIG_SETMASK, [USR1], NULL, 8) = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=58, si_uid=620, si_status=0, si_utime=0, si_stime=0} ---
wait4(-1, 0x7ffcef6a9f10, WNOHANG, NULL) = 0
rt_sigreturn({mask=[USR1]}) = 0
rt_sigaction(SIGINT, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, {0x43e780, [], SA_RESTORER, 0x2ad7c0fe4270}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
--- SIGUSR1 {si_signo=SIGUSR1, si_code=SI_USER, si_pid=58, si_uid=620} ---
+++ killed by SIGUSR1 +++
> On Feb 28, 2019, at 8:58 AM, Duncan Brown <dabrown@xxxxxxx> wrote:
>
> Hi Todd,
>
> I've encountered the following error:
>
> 6070226.0 dbrown 2/27 17:41 Error from slot1@CRUSH-SUGWG-OSG-10-5-228-85: Job did not exit as promised when sent its checkpoint signal. Promised exit was with exit code 255, actual exit status was with exit code 255.
>
> The logs are not particularly illuminating. Starter:
>
>
> 02/27/19 17:37:37 (pid:40934) GGT GGT GGT about to set io wait to 0
> 02/27/19 17:37:37 (pid:40934) GGT GGT GGT about to set io wait to 0
> 02/27/19 17:41:15 (pid:40934) Periodic Checkpointing all jobs.
> 02/27/19 17:41:15 (pid:40934) Process exited, pid=42189, status=255
> 02/27/19 17:41:15 (pid:40934) Hold all jobs
> 02/27/19 17:41:15 (pid:40934) GGT GGT GGT about to set io wait to 0
> 02/27/19 17:41:15 (pid:40934) GGT GGT GGT about to set io wait to 0
> 02/27/19 17:41:15 (pid:40934) condor_read() failed: recv(fd=11) returned -1, errno = 104 Connection reset by peer, reading 5 bytes from <10.5.2.4:5483>.
> 02/27/19 17:41:15 (pid:40934) IO: Failed to read packet header
> 02/27/19 17:41:15 (pid:40934) Lost connection to shadow, waiting 2400 secs for reconnect
> 02/27/19 17:41:15 (pid:40934) Returning from CStarter::JobReaper()
> 02/27/19 17:41:15 (pid:40934) Got SIGQUIT. Performing fast shutdown.
> 02/27/19 17:41:15 (pid:40934) ShutdownFast all jobs.
> 02/27/19 17:41:15 (pid:40934) GGT GGT GGT about to set io wait to 0
> 02/27/19 17:41:15 (pid:40934) CREDMON: Couldn't find dir "40934" in /var/lib/condor/credential
>
> Shadow:
>
> 02/27/19 14:47:11 Initializing a VANILLA shadow for job 6070226.0
> 02/27/19 14:47:11 (6070226.0) (2132975): Request to run on slot1@CRUSH-SUGWG-OSG-10-5-228-85 <10.5.228.85:9618?addrs=10.5.228.85-9618&noUDP&sock=2273_aadb_3> was ACCEPTED
> 02/27/19 14:47:11 (6070226.0) (2132975): File transfer completed successfully.
> 02/27/19 14:47:11 (6070226.0) (2132975): WriteUserLog checking for event log rotation, but no lock
> 02/27/19 14:47:20 (6070226.0) (2132975): WriteUserLog checking for event log rotation, but no lock
> 02/27/19 14:52:21 (6070226.0) (2132975): WriteUserLog checking for event log rotation, but no lock
> 02/27/19 14:55:31 (6070226.0) (2132975): File transfer completed successfully.
> 02/27/19 17:41:15 (6070226.0) (2132975): WriteUserLog checking for event log rotation, but no lock
> 02/27/19 17:41:15 (6070226.0) (2132975): Job 6070226.0 going into Hold state (code 36,65280): Error from slot1@CRUSH-SUGWG-OSG-10-5-228-85: Job did not exit as promised when sent its checkpoint signal. Promised exit was with exit code 255, actual exit status was with exit code 255.
> 02/27/19 17:41:15 (6070226.0) (2132975): **** condor_shadow (condor_SHADOW) pid 2132975 EXITING WITH STATUS 112
>
> Submit file:
>
> universe = vanilla
> executable = testwrapper.sh
> arguments = ./testjob.sh
>
> output = testjob-$(cluster).out
> error = testjob-$(cluster).err
> log = testjob-$(cluster).log
>
> transfer_executable = True
> transfer_input_files = testjob.sh, my.input
> transfer_output_files = my.output, my.checkpoint, wrapper.checkpoint, wrapper.log
> when_to_transfer_output = ON_EXIT_OR_EVICT
>
> +CheckpointExitBySignal = False
> +CheckpointExitCode = 255
>
> +WantCheckpointSignal = True
> +WantFTOnCheckpoint = True
> +CheckpointSig = 10
>
> kill_sig = 10
>
> queue
>
> Relevant checkpointing code from executable:
>
> function checkpoint_trap {
> echo "checkpoint_trap function called, sending SIGTSTP to pid ${prog_pid}" &>> wrapper.log
> /bin/kill -s TSTP ${prog_pid} &>> wrapper.log
> exit 255
> }
>
> trap checkpoint_trap USR1
>
> Will keep debugging. Any ideas?
>
> Cheers,
> Duncan.
>
>> On Feb 15, 2019, at 6:08 PM, Todd L Miller <tlmiller@xxxxxxxxxxx> wrote:
>>
>>> Follow-up question: is there a way to set something like
>>>
>>> periodic_transfer_spool = 3600
>>>
>>> so that the contents of the job's spool directory can be transferred back to the shadow's spool periodically? In combination with ON_EXIT_OR_EVICT that would give me periodic checkpointing if the job dies unexpectedly, in addition to when it is cleanly evicted.
>>
>> We have some experimental (meaning, probably at least partially broken) features intended to support this kind of use case. They're both designed around the observation that HTCondor has no real way of knowing when it's safe to transfer the job sandbox if the job is still running, but that if you're creating checkpoints, your job is going to know how to restart from them.
>>
>> If you want the job to periodically checkpoint, you can request that HTCondor send a signal to it every so often; when it exits successfully, HTCondor performs file transfer (as if the job had been evicted), but instead of going back into the queue, HTCondor just restarts the job right where it was running.
>>
>> If the job generates checkpoints on its own, you can also configure HTCondor to recognize, for example, that when the job exits with code 88, that means to perform file transfer (as if the job had been evicted), and then restart the job right where it had been running.
>>
>> See the following page on our Wiki for details, and do please let me know if either feature works for you. Thanks.
>>
>> https://htcondor-wiki.cs.wisc.edu/index.cgi/wiki?p=ExperimentalSupportForPeriodicCheckpointingInVanillaUniverse
>>
>> - ToddM
>> _______________________________________________
>> HTCondor-users mailing list
>> To unsubscribe, send a message to htcondor-users-request@xxxxxxxxxxx with a
>> subject: Unsubscribe
>> You can also unsubscribe by visiting
>> https://lists.cs.wisc.edu/mailman/listinfo/htcondor-users
>>
>> The archives can be found at:
>> https://lists.cs.wisc.edu/archive/htcondor-users/
>
> --
>
> Duncan Brown Room 263-1, Physics Department
> Charles Brightman Professor of Physics Syracuse University, NY 13244
> http://dabrown.expressions.syr.edu Phone: 315 443 5993
>
>
> _______________________________________________
> HTCondor-users mailing list
> To unsubscribe, send a message to htcondor-users-request@xxxxxxxxxxx with a
> subject: Unsubscribe
> You can also unsubscribe by visiting
> https://lists.cs.wisc.edu/mailman/listinfo/htcondor-users
>
> The archives can be found at:
> https://lists.cs.wisc.edu/archive/htcondor-users/
--
Duncan Brown Room 263-1, Physics Department
Charles Brightman Professor of Physics Syracuse University, NY 13244
http://dabrown.expressions.syr.edu Phone: 315 443 5993