[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] Idle Jobs - 42 match, match, but reject the job for unknown reasons



I submit a job using the following file.
 
chris@thebeast:~/condortest> cat test.sub
Executable = /bin/hostname
Universe = standard
Requirements = (Arch == "x86_64") && (OpSys == "LINUX")
Output  = out.$(Process)
Queue 5
 
condor_q shows the jobs sitting idle indefinitely.
 
and condor_q -ana shows the following:
 
011.004:  Run analysis summary.  Of 42 machines,
      0 are rejected by your job's requirements
      0 reject your job because of their own requirements
      0 match, but are serving users with a better priority in the pool
     42 match, match, but reject the job for unknown reasons
      0 match, but will not currently preempt their existing job
      0 are available to run your job
 
 
The SchedLog is complaining about swap space. None of the nodes in my cluster
have swap space, but they all have large amounts of physical memory, so, as the
log message suggests, I set RESERVED_SWAP to 0; condor_config_val reports that
this is set correctly. However, the problem persists.
 
None of the other logs report anything relevant, or anything logged at the same time.
 
Any ideas on this?
 
Chris
 
-----------------------------------------------------------------------------
SchedLog
-----------------------------------------------------------------------------
.....

9/23 17:51:23 Sent ad to central manager for chris@xxxxxxxxxxx

9/23 17:55:08 Activity on stashed negotiator socket

9/23 17:55:08 Negotiating for owner: chris@xxxxxxxxxxx

9/23 17:55:08 Checking consistency running and runnable jobs

9/23 17:55:08 Tables are consistent

9/23 17:55:08 Swap space estimate reached! No more jobs can be run!

9/23 17:55:08 Solution: get more swap space, or set RESERVED_SWAP = 0

9/23 17:55:08 0 jobs matched, 5 jobs idle

-----------------------------------------------------------------------------
condor_config (with comments stripped)
-----------------------------------------------------------------------------

CONDOR_HOST = thebeast

RELEASE_DIR = /home/condor/release

LOCAL_DIR = $(TILDE)/hosts/$(HOSTNAME)

LOCAL_CONFIG_FILE = $(LOCAL_DIR)/condor_config.local

CONDOR_ADMIN = condor@xxxxxxxxxxxxxxxxxxxx

MAIL = /usr/bin/mail

UID_DOMAIN = $(FULL_HOSTNAME)

#FILESYSTEM_DOMAIN = cluster.int

FILESYSTEM_DOMAIN = $(FULL_HOSTNAME)

FLOCK_FROM =

FLOCK_TO =

FLOCK_NEGOTIATOR_HOSTS = $(FLOCK_TO)

FLOCK_COLLECTOR_HOSTS = $(FLOCK_TO)

HOSTALLOW_ADMINISTRATOR = *

HOSTALLOW_OWNER = $(FULL_HOSTNAME), $(HOSTALLOW_ADMINISTRATOR)

HOSTALLOW_READ = *

HOSTALLOW_WRITE = *

HOSTALLOW_NEGOTIATOR = $(NEGOTIATOR_HOST)

HOSTALLOW_NEGOTIATOR_SCHEDD = $(NEGOTIATOR_HOST), $(FLOCK_NEGOTIATOR_HOSTS)

HOSTALLOW_WRITE_COLLECTOR = $(HOSTALLOW_WRITE), $(FLOCK_FROM)

HOSTALLOW_WRITE_STARTD = $(HOSTALLOW_WRITE), $(FLOCK_FROM)

HOSTALLOW_READ_COLLECTOR = $(HOSTALLOW_READ), $(FLOCK_FROM)

HOSTALLOW_READ_STARTD = $(HOSTALLOW_READ), $(FLOCK_FROM)

RESERVED_SWAP = 0

LOCK = /var/lock/condor

GLIDEIN_SERVER_NAME = gridftp.cs.wisc.edu

GLIDEIN_SERVER_DIR = /p/condor/public/binaries/glidein

ALL_DEBUG =

MAX_COLLECTOR_LOG = 1000000

COLLECTOR_DEBUG =

MAX_KBDD_LOG = 1000000

KBDD_DEBUG =

MAX_NEGOTIATOR_LOG = 1000000

NEGOTIATOR_DEBUG = D_MATCH

MAX_NEGOTIATOR_MATCH_LOG = 1000000

MAX_SCHEDD_LOG = 1000000

SCHEDD_DEBUG = D_FULLDEBUG

MAX_SHADOW_LOG = 1000000

SHADOW_DEBUG =

MAX_STARTD_LOG = 1000000

STARTD_DEBUG = D_COMMAND

MAX_STARTER_LOG = 1000000

STARTER_DEBUG = D_NODATE

MAX_MASTER_LOG = 1000000

MASTER_DEBUG = D_COMMAND

MINUTE = 60

HOUR = (60 * $(MINUTE))

StateTimer = (CurrentTime - EnteredCurrentState)

ActivityTimer = (CurrentTime - EnteredCurrentActivity)

ActivationTimer = (CurrentTime - JobStart)

LastCkpt = (CurrentTime - LastPeriodicCheckpoint)

STANDARD = 1

PVM = 4

VANILLA = 5

MPI = 8

IsPVM = (TARGET.JobUniverse == $(PVM))

IsMPI = (TARGET.JobUniverse == $(MPI))

IsVanilla = (TARGET.JobUniverse == $(VANILLA))

IsStandard = (TARGET.JobUniverse == $(STANDARD))

SmallJob = (TARGET.ImageSize < (15 * 1024))

NonCondorLoadAvg = (LoadAvg - CondorLoadAvg)

BackgroundLoad = 0.3

HighLoad = 0.5

StartIdleTime = 15 * $(MINUTE)

ContinueIdleTime = 5 * $(MINUTE)

MaxSuspendTime = 10 * $(MINUTE)

MaxVacateTime = 10 * $(MINUTE)

KeyboardBusy = (KeyboardIdle < $(MINUTE))

ConsoleBusy = (ConsoleIdle < $(MINUTE))

CPUIdle = ($(NonCondorLoadAvg) <= $(BackgroundLoad))

CPUBusy = ($(NonCondorLoadAvg) >= $(HighLoad))

KeyboardNotBusy = ($(KeyboardBusy) == False)

BigJob = (TARGET.ImageSize >= (50 * 1024))

MediumJob = (TARGET.ImageSize >= (15 * 1024) && TARGET.ImageSize < (50 * 1024))

SmallJob = (TARGET.ImageSize < (15 * 1024))

JustCPU = ($(CPUBusy) && ($(KeyboardBusy) == False))

MachineBusy = ($(CPUBusy) || $(KeyboardBusy))

WANT_SUSPEND = $(TESTING_WANT_SUSPEND)

WANT_VACATE = $(TESTING_WANT_VACATE)

START = $(TESTING_START)

SUSPEND = $(TESTING_SUSPEND)

CONTINUE = $(TESTING_CONTINUE)

PREEMPT = $(TESTING_PREEMPT)

KILL = $(TESTING_KILL)

PERIODIC_CHECKPOINT = $(TESTING_PERIODIC_CHECKPOINT)

PREEMPTION_REQUIREMENTS = $(TESTING_PREEMPTION_REQUIREMENTS)

PREEMPTION_RANK = $(TESTING_PREEMPTION_RANK)

TESTINGMODE_WANT_SUSPEND = False

TESTINGMODE_WANT_VACATE = False

TESTINGMODE_START = True

TESTINGMODE_SUSPEND = False

TESTINGMODE_CONTINUE = True

TESTINGMODE_PREEMPT = False

TESTINGMODE_KILL = False

TESTINGMODE_PERIODIC_CHECKPOINT = False

TESTINGMODE_PREEMPTION_REQUIREMENTS = False

TESTINGMODE_PREEMPTION_RANK = 0

LOG = $(LOCAL_DIR)/log

SPOOL = $(LOCAL_DIR)/spool

EXECUTE = $(LOCAL_DIR)/execute

BIN = $(RELEASE_DIR)/bin

LIB = $(RELEASE_DIR)/lib

SBIN = $(RELEASE_DIR)/sbin

HISTORY = $(SPOOL)/history

COLLECTOR_LOG = $(LOG)/CollectorLog

KBDD_LOG = $(LOG)/KbdLog

MASTER_LOG = $(LOG)/MasterLog

NEGOTIATOR_LOG = $(LOG)/NegotiatorLog

NEGOTIATOR_MATCH_LOG = $(LOG)/MatchLog

SCHEDD_LOG = $(LOG)/SchedLog

SHADOW_LOG = $(LOG)/ShadowLog

STARTD_LOG = $(LOG)/StartLog

STARTER_LOG = $(LOG)/StarterLog

SHADOW_LOCK = $(LOCK)/ShadowLock

COLLECTOR_HOST = $(CONDOR_HOST)

NEGOTIATOR_HOST = $(CONDOR_HOST)

RESERVED_DISK = 5

DAEMON_LIST = MASTER, STARTD, SCHEDD

MASTER = $(SBIN)/condor_master

STARTD = $(SBIN)/condor_startd

SCHEDD = $(SBIN)/condor_schedd

KBDD = $(SBIN)/condor_kbdd

NEGOTIATOR = $(SBIN)/condor_negotiator

COLLECTOR = $(SBIN)/condor_collector

GRID_MONITOR = $(SBIN)/grid_monitor.sh

MASTER_ADDRESS_FILE = $(LOG)/.master_address

PREEN = $(SBIN)/condor_preen

PREEN_ARGS = -m -r

STARTER_LIST = STARTER, STARTER_PVM, STARTER_STANDARD

STARTER = $(SBIN)/condor_starter

STARTER_PVM = $(SBIN)/condor_starter.pvm

STARTER_STANDARD = $(SBIN)/condor_starter.std

STARTD_ADDRESS_FILE = $(LOG)/.startd_address

BenchmarkTimer = (CurrentTime - LastBenchmark)

RunBenchmarks : (LastBenchmark == 0 ) || ($(BenchmarkTimer) >= (4 * $(HOUR)))

CONSOLE_DEVICES = mouse, console

COLLECTOR_HOST_STRING = "$(COLLECTOR_HOST)"

STARTD_EXPRS = COLLECTOR_HOST_STRING

STARTD_JOB_EXPRS = ImageSize, ExecutableSize, JobUniverse, NiceUser

SHADOW_LIST = SHADOW, SHADOW_PVM, SHADOW_STANDARD

SHADOW = $(SBIN)/condor_shadow

SHADOW_PVM = $(SBIN)/condor_shadow.pvm

SHADOW_STANDARD = $(SBIN)/condor_shadow.std

SCHEDD_ADDRESS_FILE = $(LOG)/.schedd_address

SHADOW_SIZE_ESTIMATE = 1800

SHADOW_RENICE_INCREMENT = 10

PERIODIC_EXPR_INTERVAL = 60

QUEUE_SUPER_USERS = root, condor

PVMD = $(SBIN)/condor_pvmd

PVMGS = $(SBIN)/condor_pvmgs

VALID_SPOOL_FILES = job_queue.log, job_queue.log.tmp, history, \

Accountant.log, Accountantnew.log

INVALID_LOG_FILES = core

JAVA =

JAVA_MAXHEAP_ARGUMENT =

JAVA_CLASSPATH_DEFAULT = $(LIB) $(LIB)/scimark2lib.jar .

JAVA_CLASSPATH_ARGUMENT = -classpath

JAVA_CLASSPATH_SEPARATOR = :

JAVA_BENCHMARK_TIME = 2

JAVA_EXTRA_ARGUMENTS =

GRIDMANAGER = $(SBIN)/condor_gridmanager

GAHP = $(SBIN)/gahp_server

MAX_GRIDMANAGER_LOG = 1000000

GRIDMANAGER_DEBUG = D_COMMAND

GRIDMANAGER_LOG = /tmp/GridmanagerLog.$(USERNAME)

CRED_MIN_TIME_LEFT = 120

-