Mailing List Archives
Authenticated access
|
|
|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Condor-users] parallel MPI job runs from command line, but not in condor?
- Date: Fri, 12 Sep 2008 14:07:45 -0500
- From: "Wingard, Jeffrey" <jwingard@xxxxxxxxx>
- Subject: [Condor-users] parallel MPI job runs from command line, but not in condor?
I have an MPI job that I can run from the command line using the following command:
/opt/scali/bin/mpirun -v -np 4 -machinefile condor_machines
/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
PARALLEL4
This works with no problems. However, when using the following submit file, the job aborts right away. I have listed the submit file and the output from using "set -x" in the MPI wrapper shell script (mpiscript).
Does anyone have any ideas?
Thanks
Jeff
universe =
parallel
machine_count = 4
executable = mpiscript
case_name =
PARALLEL4
run_number = 46
deck_name = $(case_name)
simulator_executable
= /usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
environment
= "LD_LIBRARY_PATH=/opt/scali/lib64"
arguments = $(simulator_executable)
$(deck_name)
output = PARALLEL.RUNLOG
error =
PARALLEL_$(Node).CONDOR.ERROR
log =
PARALLEL.CONDOR.LOG
transfer_input_files = $(simulator_executable),
$(case_name).DATA
should_transfer_files = YES
when_to_transfer_output =
ON_EXIT_OR_EVICT
Requirements = ( LoadAvg < 0.10 ) && (
eclipse_available > 0 )
Rank = KFlops
+AccountingGroup =
"REB_018925"
Queue
I have a "set -x" in the mpiscript wrapper, and the output follows:
[jwingard@dmvx reb]$
cat PARALLEL.CONDOR.ERROR
+ _CONDOR_PROCNO=0
+ _CONDOR_NPROCS=4
++
condor_config_val libexec
+ CONDOR_SSH=/usr/local/condor/libexec
+
CONDOR_SSH=/usr/local/condor/libexec/condor_ssh
++ condor_config_val
libexec
+ SSHD_SH=/usr/local/condor/libexec
+
SSHD_SH=/usr/local/condor/libexec/sshd.sh
+ .
/usr/local/condor/libexec/sshd.sh 0 4
++ trap sshd_cleanup 15
+++
condor_config_val CONDOR_SSHD
++ SSHD=/usr/sbin/sshd
+++ condor_config_val
CONDOR_SSH_KEYGEN
++ KEYGEN=/usr/bin/ssh-keygen
+++ condor_config_val
libexec
++ CONDOR_CHIRP=/usr/local/condor/libexec
++
CONDOR_CHIRP=/usr/local/condor/libexec/condor_chirp
++ PORT=4444
++
_CONDOR_REMOTE_SPOOL_DIR=/data/condor/spool/cluster376.proc0.subproc0
++
_CONDOR_PROCNO=0
++ _CONDOR_NPROCS=4
++ mkdir
/data/condor/execute/dir_10743/tmp
++
hostkey=/data/condor/execute/dir_10743/tmp/hostkey
++ /bin/rm -f
/data/condor/execute/dir_10743/tmp/hostkey
/data/condor/execute/dir_10743/tmp/hostkey.pub
++ /usr/bin/ssh-keygen -q -f
/data/condor/execute/dir_10743/tmp/hostkey -t rsa -N ''
++ '[' 0 -ne 0
']'
++ idkey=/data/condor/execute/dir_10743/tmp/0.key
++
/usr/bin/ssh-keygen -q -f /data/condor/execute/dir_10743/tmp/0.key -t rsa -N
''
++ '[' 0 -ne 0 ']'
++ /usr/local/condor/libexec/condor_chirp put -perm
0700 /data/condor/execute/dir_10743/tmp/0.key
/data/condor/spool/cluster376.proc0.subproc0/0.key
++ '[' 0 -ne 0 ']'
++
done=0
++ '[' 0 -eq 0 ']'
++ /usr/sbin/sshd -p4444
-oAuthorizedKeysFile=/data/condor/execute/dir_10743/tmp/0.key.pub
-h/data/condor/execute/dir_10743/tmp/hostkey -De -f/dev/null -oStrictModes=no
-oPidFile=/dev/null -oAcceptEnv=_CONDOR
++ pid=10766
++ sleep 2
++ grep
'Server listening' sshd.out
++ done=1
++ '[' 1 -eq 0 ']'
++ /bin/rm
sshd.out
+++ hostname
++ hostname=c02.vxnet
+++ pwd
++
currentDir=/c02/condor/execute/dir_10743
+++ whoami
++ user=jwingard
++
echo '0 c02.vxnet 4444 jwingard /c02/condor/execute/dir_10743'
++
/usr/local/condor/libexec/condor_chirp put -mode cwa -
/data/condor/spool/cluster376.proc0.subproc0/contact
++ '[' 0 -ne 0 ']'
++
'[' 0 -eq 0 ']'
++ done=0
++ '[' 0 -eq 0 ']'
++ /bin/rm -f
contact
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/contact
/data/condor/execute/dir_10743/contact
+++ wc -l
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++
lines=1
++ '[' 1 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm
-f contact
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/contact
/data/condor/execute/dir_10743/contact
+++ wc -l
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++
lines=3
++ '[' 3 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm
-f contact
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/contact
/data/condor/execute/dir_10743/contact
+++ wc -l
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++
lines=3
++ '[' 3 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm
-f contact
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/contact
/data/condor/execute/dir_10743/contact
+++ wc -l
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++
lines=4
++ '[' 4 -eq 4 ']'
++ done=1
++ node=0
++ '[' 0 -ne 4
']'
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/0.key
/data/condor/execute/dir_10743/tmp/0.key
++
/usr/local/condor/libexec/condor_chirp remove
/data/condor/spool/cluster376.proc0.subproc0/0.key
+++ expr 0 + 1
++
node=1
++ '[' 1 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/1.key
/data/condor/execute/dir_10743/tmp/1.key
++
/usr/local/condor/libexec/condor_chirp remove
/data/condor/spool/cluster376.proc0.subproc0/1.key
+++ expr 1 + 1
++
node=2
++ '[' 2 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/2.key
/data/condor/execute/dir_10743/tmp/2.key
++
/usr/local/condor/libexec/condor_chirp remove
/data/condor/spool/cluster376.proc0.subproc0/2.key
+++ expr 2 + 1
++
node=3
++ '[' 3 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch
/data/condor/spool/cluster376.proc0.subproc0/3.key
/data/condor/execute/dir_10743/tmp/3.key
++
/usr/local/condor/libexec/condor_chirp remove
/data/condor/spool/cluster376.proc0.subproc0/3.key
+++ expr 3 + 1
++
node=4
++ '[' 4 -ne 4 ']'
++ chmod 0700
/data/condor/execute/dir_10743/tmp/0.key
/data/condor/execute/dir_10743/tmp/1.key
/data/condor/execute/dir_10743/tmp/2.key
/data/condor/execute/dir_10743/tmp/3.key
++
/usr/local/condor/libexec/condor_chirp remove
/data/condor/spool/cluster376.proc0.subproc0/contact
++ '[' 1 -eq 0 ']'
+
'[' 0 -ne 0 ']'
+
EXECUTABLE=/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
+
shift
+ chmod +x
/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
chmod:
changing permissions of
`/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe': Operation not
permitted
+ MPDIR=/opt/scali/bin
+
PATH=/opt/scali/bin:.:/usr/local/condor/bin:/sbin:/usr/sbin:/bin:/usr/bin:/usr/X11R6/bin
+
export PATH
+ export P4_RSHCOMMAND=/usr/local/condor/libexec/condor_ssh
+
P4_RSHCOMMAND=/usr/local/condor/libexec/condor_ssh
+
CONDOR_CONTACT_FILE=/data/condor/execute/dir_10743/contact
+ export
CONDOR_CONTACT_FILE
+ sort -n +0
+ awk '{print $2}'
+ mpirun -v -np 4
-machinefile condor_machines
/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe PARALLEL4
***
glibc detected *** malloc(): memory corruption: 0x0000000000505700
***
/opt/scali/bin/mpirun: line 646: 10885
Aborted
$MPIMON $MPIMON_OPTS $_PROGRAM $_PROGOPTS -- $RUN_LIST
+ sshd_cleanup
+
/bin/rm -f /data/condor/execute/dir_10743/tmp/hostkey
/data/condor/execute/dir_10743/tmp/hostkey.pub
/data/condor/execute/dir_10743/tmp/0.key
/data/condor/execute/dir_10743/tmp/0.key.pub sshd.out
/data/condor/execute/dir_10743/contact
+ exit 0