I submit a job using the following
file.
Requirements = (Arch == "x86_64") && (OpSys == "LINUX")
Output = out.$(Process)
Queue 5

condor_q shows the jobs sitting idle
indefinitely,
and condor_q -ana shows the following:
011.004: Run analysis summary. Of 42 machines,
      0 are rejected by your job's requirements
      0 reject your job because of their own requirements
      0 match, but are serving users with a better priority in the pool
     42 match, match, but reject the job for unknown reasons
      0 match, but will not currently preempt their existing job
      0 are available to run your job

The SchedLog is complaining about
swap space. None of the nodes in my cluster
have swap space, but they have large amounts
of physical memory, so as suggested I
set RESERVED_SWAP to 0;
condor_config_val reports this as set correctly.
But the problem still
persists.
None of the other logs report
anything relevant, or anything at the same time.
Any ideas on this?
Chris
-----------------------------------------------------------------------------
SchedLog
-----------------------------------------------------------------------------
.....
9/23 17:51:23 Sent ad to central manager for chris@xxxxxxxxxxx
9/23 17:55:08 Activity on stashed negotiator socket
9/23 17:55:08 Negotiating for owner: chris@xxxxxxxxxxx
9/23 17:55:08 Checking consistency running and runnable jobs
9/23 17:55:08 Tables are consistent
9/23 17:55:08 Swap space estimate reached! No more jobs can be run!
9/23 17:55:08     Solution: get more swap space, or set RESERVED_SWAP = 0
9/23 17:55:08 0 jobs matched, 5 jobs idle
-----------------------------------------------------------------------------
condor_config (with comments stripped)
-----------------------------------------------------------------------------
CONDOR_HOST = thebeast
RELEASE_DIR = /home/condor/release
LOCAL_DIR = $(TILDE)/hosts/$(HOSTNAME)
LOCAL_CONFIG_FILE = $(LOCAL_DIR)/condor_config.local
CONDOR_ADMIN = condor@xxxxxxxxxxxxxxxxxxxx
MAIL = /usr/bin/mail
UID_DOMAIN = $(FULL_HOSTNAME)
#FILESYSTEM_DOMAIN = cluster.int
FILESYSTEM_DOMAIN = $(FULL_HOSTNAME)
FLOCK_FROM =
FLOCK_TO =
FLOCK_NEGOTIATOR_HOSTS = $(FLOCK_TO)
FLOCK_COLLECTOR_HOSTS = $(FLOCK_TO)
HOSTALLOW_ADMINISTRATOR = *
HOSTALLOW_OWNER = $(FULL_HOSTNAME), $(HOSTALLOW_ADMINISTRATOR)
HOSTALLOW_READ = *
HOSTALLOW_WRITE = *
HOSTALLOW_NEGOTIATOR = $(NEGOTIATOR_HOST)
HOSTALLOW_NEGOTIATOR_SCHEDD = $(NEGOTIATOR_HOST), $(FLOCK_NEGOTIATOR_HOSTS)
HOSTALLOW_WRITE_COLLECTOR = $(HOSTALLOW_WRITE), $(FLOCK_FROM)
HOSTALLOW_WRITE_STARTD = $(HOSTALLOW_WRITE), $(FLOCK_FROM)
HOSTALLOW_READ_COLLECTOR = $(HOSTALLOW_READ), $(FLOCK_FROM)
HOSTALLOW_READ_STARTD = $(HOSTALLOW_READ), $(FLOCK_FROM)
RESERVED_SWAP = 0
LOCK = /var/lock/condor
GLIDEIN_SERVER_NAME = gridftp.cs.wisc.edu
GLIDEIN_SERVER_DIR = /p/condor/public/binaries/glidein
ALL_DEBUG =
MAX_COLLECTOR_LOG = 1000000
COLLECTOR_DEBUG =
MAX_KBDD_LOG = 1000000
KBDD_DEBUG =
MAX_NEGOTIATOR_LOG = 1000000
NEGOTIATOR_DEBUG = D_MATCH
MAX_NEGOTIATOR_MATCH_LOG = 1000000
MAX_SCHEDD_LOG = 1000000
SCHEDD_DEBUG = D_FULLDEBUG
MAX_SHADOW_LOG = 1000000
SHADOW_DEBUG =
MAX_STARTD_LOG = 1000000
STARTD_DEBUG = D_COMMAND
MAX_STARTER_LOG = 1000000
STARTER_DEBUG = D_NODATE
MAX_MASTER_LOG = 1000000
MASTER_DEBUG = D_COMMAND
MINUTE = 60
HOUR = (60 * $(MINUTE))
StateTimer = (CurrentTime - EnteredCurrentState)
ActivityTimer = (CurrentTime - EnteredCurrentActivity)
ActivationTimer = (CurrentTime - JobStart)
LastCkpt = (CurrentTime - LastPeriodicCheckpoint)
STANDARD = 1
PVM = 4
VANILLA = 5
MPI = 8
IsPVM = (TARGET.JobUniverse == $(PVM))
IsMPI = (TARGET.JobUniverse == $(MPI))
IsVanilla = (TARGET.JobUniverse == $(VANILLA))
IsStandard = (TARGET.JobUniverse == $(STANDARD))
SmallJob = (TARGET.ImageSize < (15 * 1024))
NonCondorLoadAvg = (LoadAvg - CondorLoadAvg)
BackgroundLoad = 0.3
HighLoad = 0.5
StartIdleTime = 15 * $(MINUTE)
ContinueIdleTime = 5 * $(MINUTE)
MaxSuspendTime = 10 * $(MINUTE)
MaxVacateTime = 10 * $(MINUTE)
KeyboardBusy = (KeyboardIdle < $(MINUTE))
ConsoleBusy = (ConsoleIdle < $(MINUTE))
CPUIdle = ($(NonCondorLoadAvg) <= $(BackgroundLoad))
CPUBusy = ($(NonCondorLoadAvg) >= $(HighLoad))
KeyboardNotBusy = ($(KeyboardBusy) == False)
BigJob = (TARGET.ImageSize >= (50 * 1024))
MediumJob = (TARGET.ImageSize >= (15 * 1024) && TARGET.ImageSize < (50 * 1024))
SmallJob = (TARGET.ImageSize < (15 * 1024))
JustCPU = ($(CPUBusy) && ($(KeyboardBusy) == False))
MachineBusy = ($(CPUBusy) || $(KeyboardBusy))
WANT_SUSPEND = $(TESTING_WANT_SUSPEND)
WANT_VACATE = $(TESTING_WANT_VACATE)
START = $(TESTING_START)
SUSPEND = $(TESTING_SUSPEND)
CONTINUE = $(TESTING_CONTINUE)
PREEMPT = $(TESTING_PREEMPT)
KILL = $(TESTING_KILL)
PERIODIC_CHECKPOINT = $(TESTING_PERIODIC_CHECKPOINT)
PREEMPTION_REQUIREMENTS = $(TESTING_PREEMPTION_REQUIREMENTS)
PREEMPTION_RANK = $(TESTING_PREEMPTION_RANK)
TESTINGMODE_WANT_SUSPEND = False
TESTINGMODE_WANT_VACATE = False
TESTINGMODE_START = True
TESTINGMODE_SUSPEND = False
TESTINGMODE_CONTINUE = True
TESTINGMODE_PREEMPT = False
TESTINGMODE_KILL = False
TESTINGMODE_PERIODIC_CHECKPOINT = False
TESTINGMODE_PREEMPTION_REQUIREMENTS = False
TESTINGMODE_PREEMPTION_RANK = 0
LOG = $(LOCAL_DIR)/log
SPOOL = $(LOCAL_DIR)/spool
EXECUTE = $(LOCAL_DIR)/execute
BIN = $(RELEASE_DIR)/bin
LIB = $(RELEASE_DIR)/lib
SBIN = $(RELEASE_DIR)/sbin
HISTORY = $(SPOOL)/history
COLLECTOR_LOG = $(LOG)/CollectorLog
KBDD_LOG = $(LOG)/KbdLog
MASTER_LOG = $(LOG)/MasterLog
NEGOTIATOR_LOG = $(LOG)/NegotiatorLog
NEGOTIATOR_MATCH_LOG = $(LOG)/MatchLog
SCHEDD_LOG = $(LOG)/SchedLog
SHADOW_LOG = $(LOG)/ShadowLog
STARTD_LOG = $(LOG)/StartLog
STARTER_LOG = $(LOG)/StarterLog
SHADOW_LOCK = $(LOCK)/ShadowLock
COLLECTOR_HOST = $(CONDOR_HOST)
NEGOTIATOR_HOST = $(CONDOR_HOST)
RESERVED_DISK = 5
DAEMON_LIST = MASTER, STARTD, SCHEDD
MASTER = $(SBIN)/condor_master
STARTD = $(SBIN)/condor_startd
SCHEDD = $(SBIN)/condor_schedd
KBDD = $(SBIN)/condor_kbdd
NEGOTIATOR = $(SBIN)/condor_negotiator
COLLECTOR = $(SBIN)/condor_collector
GRID_MONITOR = $(SBIN)/grid_monitor.sh
MASTER_ADDRESS_FILE = $(LOG)/.master_address
PREEN = $(SBIN)/condor_preen
PREEN_ARGS = -m -r
STARTER_LIST = STARTER, STARTER_PVM, STARTER_STANDARD
STARTER = $(SBIN)/condor_starter
STARTER_PVM = $(SBIN)/condor_starter.pvm
STARTER_STANDARD = $(SBIN)/condor_starter.std
STARTD_ADDRESS_FILE = $(LOG)/.startd_address
BenchmarkTimer = (CurrentTime - LastBenchmark)
RunBenchmarks : (LastBenchmark == 0 ) || ($(BenchmarkTimer) >= (4 * $(HOUR)))
CONSOLE_DEVICES = mouse, console
COLLECTOR_HOST_STRING = "$(COLLECTOR_HOST)"
STARTD_EXPRS = COLLECTOR_HOST_STRING
STARTD_JOB_EXPRS = ImageSize, ExecutableSize, JobUniverse, NiceUser
SHADOW_LIST = SHADOW, SHADOW_PVM, SHADOW_STANDARD
SHADOW = $(SBIN)/condor_shadow
SHADOW_PVM = $(SBIN)/condor_shadow.pvm
SHADOW_STANDARD = $(SBIN)/condor_shadow.std
SCHEDD_ADDRESS_FILE = $(LOG)/.schedd_address
SHADOW_SIZE_ESTIMATE = 1800
SHADOW_RENICE_INCREMENT = 10
PERIODIC_EXPR_INTERVAL = 60
QUEUE_SUPER_USERS = root, condor
PVMD = $(SBIN)/condor_pvmd
PVMGS = $(SBIN)/condor_pvmgs
VALID_SPOOL_FILES = job_queue.log, job_queue.log.tmp, history, \
    Accountant.log, Accountantnew.log
INVALID_LOG_FILES = core
JAVA =
JAVA_MAXHEAP_ARGUMENT =
JAVA_CLASSPATH_DEFAULT = $(LIB) $(LIB)/scimark2lib.jar .
JAVA_CLASSPATH_ARGUMENT = -classpath
JAVA_CLASSPATH_SEPARATOR = :
JAVA_BENCHMARK_TIME = 2
JAVA_EXTRA_ARGUMENTS =
GRIDMANAGER = $(SBIN)/condor_gridmanager
GAHP = $(SBIN)/gahp_server
MAX_GRIDMANAGER_LOG = 1000000
GRIDMANAGER_DEBUG = D_COMMAND
GRIDMANAGER_LOG = /tmp/GridmanagerLog.$(USERNAME)
CRED_MIN_TIME_LEFT = 120
- |