Mailing List Archives
	Authenticated access
	
	
     | 
    
	 
	 
     | 
    
	
	 
     | 
  
 
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Condor-users] parallel MPI job runs from command line, but not in condor?
- Date: Fri, 12 Sep 2008 14:07:45 -0500
 
- From: "Wingard, Jeffrey" <jwingard@xxxxxxxxx>
 
- Subject: [Condor-users] parallel MPI job runs from command line, but not in condor?
 
I have an MPI job that I can run from the command line using the
following line:
 
/opt/scali/bin/mpirun -v -np 4 -machinefile condor_machines 
/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe 
PARALLEL4
 
This works with no problems. However, when using the following submit
file, the job aborts right away. I have listed the submit file and the
output from using "set -x" in the MPI wrapper shell script (mpiscript).
 
Does anyone have any 
ideas?
 
Thanks
Jeff
 
 
universe = parallel
machine_count = 4
executable = mpiscript
case_name = PARALLEL4
run_number = 46
deck_name = $(case_name)
simulator_executable = /usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
environment = "LD_LIBRARY_PATH=/opt/scali/lib64"
arguments = $(simulator_executable) $(deck_name)
output = PARALLEL.RUNLOG
error = PARALLEL_$(Node).CONDOR.ERROR
log = PARALLEL.CONDOR.LOG
transfer_input_files = $(simulator_executable), $(case_name).DATA
should_transfer_files = YES
when_to_transfer_output = ON_EXIT_OR_EVICT
Requirements = ( LoadAvg < 0.10 ) && ( eclipse_available > 0 )
Rank = KFlops
+AccountingGroup = "REB_018925"
Queue
 
I have a "set -x" in the mpiscript wrapper, and the output follows:
 
[jwingard@dmvx reb]$ 
cat PARALLEL.CONDOR.ERROR
+ _CONDOR_PROCNO=0
+ _CONDOR_NPROCS=4
++ 
condor_config_val libexec
+ CONDOR_SSH=/usr/local/condor/libexec
+ 
CONDOR_SSH=/usr/local/condor/libexec/condor_ssh
++ condor_config_val 
libexec
+ SSHD_SH=/usr/local/condor/libexec
+ 
SSHD_SH=/usr/local/condor/libexec/sshd.sh
+ . 
/usr/local/condor/libexec/sshd.sh 0 4
++ trap sshd_cleanup 15
+++ 
condor_config_val CONDOR_SSHD
++ SSHD=/usr/sbin/sshd
+++ condor_config_val 
CONDOR_SSH_KEYGEN
++ KEYGEN=/usr/bin/ssh-keygen
+++ condor_config_val 
libexec
++ CONDOR_CHIRP=/usr/local/condor/libexec
++ 
CONDOR_CHIRP=/usr/local/condor/libexec/condor_chirp
++ PORT=4444
++ 
_CONDOR_REMOTE_SPOOL_DIR=/data/condor/spool/cluster376.proc0.subproc0
++ 
_CONDOR_PROCNO=0
++ _CONDOR_NPROCS=4
++ mkdir 
/data/condor/execute/dir_10743/tmp
++ 
hostkey=/data/condor/execute/dir_10743/tmp/hostkey
++ /bin/rm -f 
/data/condor/execute/dir_10743/tmp/hostkey 
/data/condor/execute/dir_10743/tmp/hostkey.pub
++ /usr/bin/ssh-keygen -q -f 
/data/condor/execute/dir_10743/tmp/hostkey -t rsa -N ''
++ '[' 0 -ne 0 
']'
++ idkey=/data/condor/execute/dir_10743/tmp/0.key
++ 
/usr/bin/ssh-keygen -q -f /data/condor/execute/dir_10743/tmp/0.key -t rsa -N 
''
++ '[' 0 -ne 0 ']'
++ /usr/local/condor/libexec/condor_chirp put -perm 
0700 /data/condor/execute/dir_10743/tmp/0.key 
/data/condor/spool/cluster376.proc0.subproc0/0.key
++ '[' 0 -ne 0 ']'
++ 
done=0
++ '[' 0 -eq 0 ']'
++ /usr/sbin/sshd -p4444 
-oAuthorizedKeysFile=/data/condor/execute/dir_10743/tmp/0.key.pub 
-h/data/condor/execute/dir_10743/tmp/hostkey -De -f/dev/null -oStrictModes=no 
-oPidFile=/dev/null -oAcceptEnv=_CONDOR
++ pid=10766
++ sleep 2
++ grep 
'Server listening' sshd.out
++ done=1
++ '[' 1 -eq 0 ']'
++ /bin/rm 
sshd.out
+++ hostname
++ hostname=c02.vxnet
+++ pwd
++ 
currentDir=/c02/condor/execute/dir_10743
+++ whoami
++ user=jwingard
++ 
echo '0 c02.vxnet 4444 jwingard /c02/condor/execute/dir_10743'
++ 
/usr/local/condor/libexec/condor_chirp put -mode cwa - 
/data/condor/spool/cluster376.proc0.subproc0/contact
++ '[' 0 -ne 0 ']'
++ 
'[' 0 -eq 0 ']'
++ done=0
++ '[' 0 -eq 0 ']'
++ /bin/rm -f 
contact
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/contact 
/data/condor/execute/dir_10743/contact
+++ wc -l 
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ 
lines=1
++ '[' 1 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm 
-f contact
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/contact 
/data/condor/execute/dir_10743/contact
+++ wc -l 
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ 
lines=3
++ '[' 3 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm 
-f contact
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/contact 
/data/condor/execute/dir_10743/contact
+++ wc -l 
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ 
lines=3
++ '[' 3 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm 
-f contact
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/contact 
/data/condor/execute/dir_10743/contact
+++ wc -l 
/data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ 
lines=4
++ '[' 4 -eq 4 ']'
++ done=1
++ node=0
++ '[' 0 -ne 4 
']'
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/0.key 
/data/condor/execute/dir_10743/tmp/0.key
++ 
/usr/local/condor/libexec/condor_chirp remove 
/data/condor/spool/cluster376.proc0.subproc0/0.key
+++ expr 0 + 1
++ 
node=1
++ '[' 1 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/1.key 
/data/condor/execute/dir_10743/tmp/1.key
++ 
/usr/local/condor/libexec/condor_chirp remove 
/data/condor/spool/cluster376.proc0.subproc0/1.key
+++ expr 1 + 1
++ 
node=2
++ '[' 2 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/2.key 
/data/condor/execute/dir_10743/tmp/2.key
++ 
/usr/local/condor/libexec/condor_chirp remove 
/data/condor/spool/cluster376.proc0.subproc0/2.key
+++ expr 2 + 1
++ 
node=3
++ '[' 3 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch 
/data/condor/spool/cluster376.proc0.subproc0/3.key 
/data/condor/execute/dir_10743/tmp/3.key
++ 
/usr/local/condor/libexec/condor_chirp remove 
/data/condor/spool/cluster376.proc0.subproc0/3.key
+++ expr 3 + 1
++ 
node=4
++ '[' 4 -ne 4 ']'
++ chmod 0700 
/data/condor/execute/dir_10743/tmp/0.key 
/data/condor/execute/dir_10743/tmp/1.key 
/data/condor/execute/dir_10743/tmp/2.key 
/data/condor/execute/dir_10743/tmp/3.key
++ 
/usr/local/condor/libexec/condor_chirp remove 
/data/condor/spool/cluster376.proc0.subproc0/contact
++ '[' 1 -eq 0 ']'
+ 
'[' 0 -ne 0 ']'
+ 
EXECUTABLE=/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
+ 
shift
+ chmod +x 
/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
chmod: 
changing permissions of 
`/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe': Operation not 
permitted
+ MPDIR=/opt/scali/bin
+ 
PATH=/opt/scali/bin:.:/usr/local/condor/bin:/sbin:/usr/sbin:/bin:/usr/bin:/usr/X11R6/bin
+ 
export PATH
+ export P4_RSHCOMMAND=/usr/local/condor/libexec/condor_ssh
+ 
P4_RSHCOMMAND=/usr/local/condor/libexec/condor_ssh
+ 
CONDOR_CONTACT_FILE=/data/condor/execute/dir_10743/contact
+ export 
CONDOR_CONTACT_FILE
+ sort -n +0
+ awk '{print $2}'
+ mpirun -v -np 4 
-machinefile condor_machines 
/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe PARALLEL4
*** 
glibc detected *** malloc(): memory corruption: 0x0000000000505700 
***
/opt/scali/bin/mpirun: line 646: 10885 
Aborted                 
$MPIMON $MPIMON_OPTS $_PROGRAM $_PROGOPTS -- $RUN_LIST
+ sshd_cleanup
+ 
/bin/rm -f /data/condor/execute/dir_10743/tmp/hostkey 
/data/condor/execute/dir_10743/tmp/hostkey.pub 
/data/condor/execute/dir_10743/tmp/0.key 
/data/condor/execute/dir_10743/tmp/0.key.pub sshd.out 
/data/condor/execute/dir_10743/contact
+ exit 0