***For your information about my machine:
$CondorVersion: 7.8.0 May 09 2012 $
$CondorPlatform: x86_deb_5.0 $
OS: Ubuntu 12.04 LTS
MPI installation based on this tutorial:
https://help.ubuntu.com/community/MpichCluster
Install HTCondor on 2 PCs. One is as Central Manager, and the others as dedicated machine.
This job is work well:
universe = parallel
executable = /bin/sleep
arguments = 30
machine_count = 1
log = log
output = output
error = error
notification = never
should_transfer_files = always
when_to_transfer_output = on_exit
queue
##########################
***This is the facing problem:
The generated result from MPI job (in Output file) does not correspond as expected.
##########################
***mp1script:
#!/bin/sh
_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS
CONDOR_SSH=`condor_config_val libexec`
CONDOR_SSH=$CONDOR_SSH/condor_ssh
SSHD_SH=`condor_config_val libexec`
SSHD_SH=$SSHD_SH/sshd.sh
. $SSHD_SH $_CONDOR_PROCNO $_CONDOR_NPROCS
# If not the head node, just sleep
forever, to let the
# sshds run
if [ $_CONDOR_PROCNO -ne 0 ]
then
wait
sshd_cleanup
fi
EXECUTABLE=$1
shift
# the binary is copied but the executable flag is cleared.
# so the script have to take care of this
chmod +x $EXECUTABLE
# Set this to the bin directory of MPICH installation
MPDIR=/usr/bin
PATH=$MPDIR:.:$PATH
export PATH
export P4_RSHCOMMAND=$CONDOR_SSH
CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE
# The second field in the contact file is the machine name
# that condor_ssh knows how to use
sort -n -k 1 < $CONDOR_CONTACT_FILE | awk '{print $2}' > machines
## run the actual mpijob
mpirun -v -np $_CONDOR_NPROCS -machinefile machines $EXECUTABLE $@
sshd_cleanup
rm -f machines
exit
$?
##############################
***submission file:
universe = parallel
executable = mp1script
arguments = mpi_hello
machine_count = 1
log = logmpi
output = outputmpi
error = errormpi
should_transfer_files = yes
when_to_transfer_output = on_exit
transfer_input_files = mpi_hello
queue
################################
***Output file:
host: abcd18
==================================================================================================
mpiexec options:
----------------
Base path: /usr/bin/
Launcher: (null)
Debug level: 1
Enable X: -1
Global environment:
-------------------
_CONDOR_REMOTE_SPOOL_DIR=/var/lib/condor/spool/133/0/cluster133.proc0.subproc0
_CONDOR_MACHINE_AD=/var/lib/condor/execute/dir_3682/.machine.ad
_CONDOR_JOB_PIDS=
TMPDIR=/var/lib/condor/execute/dir_3682
_CONDOR_JOB_IWD=/var/lib/condor/execute/dir_3682
_CONDOR_PROCNO=mpi_hello
TEMP=/var/lib/condor/execute/dir_3682
_CONDOR_ANCESTOR_1010=3682:1374466809:3228693328
_CONDOR_NPROCS=
PATH=/usr/bin:.:/usr/bin:/sbin:/usr/sbin:/bin:/usr/bin
TMP=/var/lib/condor/execute/dir_3682
_CONDOR_ANCESTOR_975=1010:1374140584:1924940060
_CONDOR_SLOT=slot1
CONDOR_CONFIG=/etc/condor/condor_config
CONDOR_CONTACT_FILE=/var/lib/condor/execute/dir_3682/contact
P4_RSHCOMMAND=/usr/lib/condor/libexec/condor_ssh
PWD=/var/lib/condor/execute/dir_3682
_CONDOR_ANCESTOR_3682=3686:1374466815:2964468586
_CONDOR_JOB_AD=/var/lib/condor/execute/dir_3682/.job.ad
_CONDOR_SCRATCH_DIR=/var/lib/condor/execute/dir_3682
Hydra internal environment:
---------------------------
GFORTRAN_UNBUFFERED_PRECONNECTED=y
Proxy information:
*********************
[1] proxy: abcd18 (1 cores)
Exec list: machines (0 processes);
==================================================================================================
[mpiexec@abcd18] Timeout set to -1 (-1 means infinite)
###########################
***Error file:
/var/lib/condor/execute/dir_3682/condor_exec.exe: 125: [: Illegal number: mpi_hello
/var/lib/condor/execute/dir_3682/condor_exec.exe: 37: [: Illegal number:
mpi_hello
/var/lib/condor/execute/dir_3682/condor_exec.exe: 62: /var/lib/condor/execute/dir_3682/condor_exec.exe: cannot open /var/lib/condor/execute/dir_3682/contact: No such file
[mpiexec@abcd18] HYD_pmcd_pmi_alloc_pg_scratch (./pm/pmiserv/pmiserv_utils.c:595): assert (pg->pg_process_count * sizeof(struct HYD_pmcd_pmi_ecount)) failed
[mpiexec@abcd18] HYD_pmci_launch_procs (./pm/pmiserv/pmiserv_pmci.c:103): error allocating pg scratch space
[mpiexec@abcd18] main (./ui/mpich/mpiexec.c:401): process manager returned error launching processes
###############################
I do not have any idea how to overcome
this problem and I have tried to find the same problem through search engine which maybe faced by other users, but return me no related result.
Any help is very appreciated.