|
Hi Dan,
Another question: What is the version of the HTCondor Schedd that is sending jobs to your CE? Is this also v25 or something older like v24?
-Cole Bollig
From: Whitehouse, Dan <d.whitehouse@xxxxxxxxxxxxxx>
Sent: Wednesday, March 25, 2026 6:11 AM
To: HTCondor-Users Mail List <htcondor-users@xxxxxxxxxxx>
Cc: Cole Bollig <cabollig@xxxxxxxx>
Subject: Re: OsUser job-router crashes
Hi Cole,
Thanks very much for your assistance with this.
The additional diagnostics are as follows:
Arguments = ""
BatchQueue = ""
BatchRuntime = 259200
BytesRecvd = 28044.0
BytesSent = 73780.0
CERequirements = "CondorCE"
ClusterId = 1639290
Cmd = "DIRAC_9xq1tffd_pilotwrapper.py"
CommittedSlotTime = 0
CommittedSuspensionTime = 0
CommittedTime = 0
CompletionDate = 0
CondorCE = 1
CpusProvisioned = 8
CpusUsage = 0.1141048636842512
CumulativeRemoteSysCpu = 0.0
CumulativeRemoteUserCpu = 0.0
CumulativeSlotTime = 0
CumulativeSuspensionTime = 0
CurrentHosts = 0
DiskProvisioned = 7671
DiskUsage = 2250000
DiskUsage_RAW = 2250000
EnteredCurrentStatus = 1774431475
Environment = "CONDORCE_COLLECTOR_HOST=<...>:9619 DIRAC_PILOT_STAMP=<...> HTCONDOR_JOBID=1328108.2 LANG=en_GB.UTF-8"
Err = "1328108.2.err"
ExecutableSize = 30
ExecutableSize_RAW = 28
ExitBySignal = false
ExitCode = 1
ExitStatus = 0
HoldReason = "The job attribute OnExitHold expression 'ExitCode =!= 0' evaluated to TRUE"
HoldReasonCode = 3
HoldReasonSubCode = 55
ImageSize = 500000
ImageSize_RAW = 500000
In = "/dev/null"
Iwd = "/var/lib/condor-ce/spool/8108/2/cluster1328108.proc2.subproc0"
JobCurrentStartDate = 1774344542
JobCurrentStartExecutingDate = 1774344542
JobIsRunning = (JobStatus =!= 1) && (JobStatus =!= 5)
JobLeaseDuration = 2400
JobMemory = RequestMemory
JobNotification = 0
JobPrio = 0
JobRunCount = 1
JobStartDate = 1774344542
JobStatus = 1
JobSubmitFile = "/opt/dirac/data/HTCondor/work/HTCondorCE_uzpqa_kg.sub"
JobSubmitMethod = 0
JobUniverse = 5
KillSig = "SIGTERM"
LastHoldReason = "Spooling input data files"
LastHoldReasonCode = 16
LastJobStatus = 5
LastReleaseReason = "Data files spooled"
LastSuspensionTime = 0
LeaveJobInQueue = JobStatus == 4
Managed = "Schedd"
ManagedManager = ""
MaxHosts = 1
MemoryProvisioned = 2048
MemoryUsage = ((ResidentSetSize + 1023) / 1024)
MinHosts = 1
MyType = "Job"
NumCkpts = 0
NumCkpts_RAW = 0
NumHolds = 1
NumHoldsByReason = [ JobPolicy = 1 ]
NumJobCompletions = 0
NumJobMatches = 1
NumJobStarts = 1
NumRestarts = 0
NumShadowStarts = 1
NumSystemHolds = 0
OnExitHold = ExitCode =!= 0
>
Out = "1328108.2.out"
Owner = "<...>"
ProcId = 0
QDate = 1774431475
Rank = 0.0
ReleaseReason = "Data files spooled"
RemoteSysCpu = 0.0
RemoteUserCpu = 0.0
Remote_JobUniverse = 5
RequestCpus = 8
RequestDisk = DiskUsage
RequestMemory = 2000
Requirements = (NumJobStarts == 0) && (TARGET.Arch == "X86_64") && (TARGET.OpSys == "LINUX") && (TARGET.Disk >= RequestDisk) && (TARGET.Memory >= RequestMemory) && (TARGET.Cpus >= RequestCpus) && (TARGET.HasFileTransfer)
ResidentSetSize = 47500
ResidentSetSize_RAW = 47500
RouteName = "Condor_Pool"
RoutedBy = "htcondor-ce"
RoutedFromJobId = "1328108.2"
RoutedJob = true
SUBMIT_Cmd = "/opt/dirac/data/HTCondor/work/DIRAC_9xq1tffd_pilotwrapper.py"
SUBMIT_UserLog = "/opt/dirac/data/HTCondor/work/<...>/3/17/1328108.2.log"
SUBMIT_x509userproxy = "/tmp/tmpg2sfoji_"
ScitokensFile = "/opt/dirac/data/HTCondor/work/HTCondorCE_yxnffj4i.token"
ScratchDirFileCount = 55899
ShadowBday = 1774344542
ShouldTransferFiles = "YES"
SpooledOutputFiles = ""
StreamErr = false
StreamOut = false
TargetType = "Machine"
TotalSuspensions = 0
TransferIn = false
TransferInputSizeMB = 0
TransferOutput = ""
TransferOutputRemaps = undefined
WhenToTransferOutput = "ON_EXIT_OR_EVICT"
orig_AuthTokenId = "<...>"
orig_AuthTokenIssuer = "https://dteam-auth.cern.ch/"
orig_AuthTokenScopes = "compute.create,compute.read,compute.cancel,compute.modify"
orig_AuthTokenSubject = "<...>"
orig_OnExitHold = ExitCode =!= 0
orig_OnExitHoldSubCode = 55
orig_environment = "DIRAC_PILOT_STAMP=<...> HTCONDOR_JOBID=1328108.2"
osg_environment = ""
remote_NodeNumber = 1
remote_SMPGranularity = 1
x509UserProxyEmail = "<...>"
x509UserProxyExpiration = 1774392262
x509UserProxyFQAN = "<...>"
x509UserProxyFirstFQAN = "/dteam/Role=NULL/Capability=NULL"
x509UserProxyVOName = "dteam"
x509userproxy = "tmpg2sfoji_"
x509userproxysubject = "<...>"
03/25/26 09:38:20 Failed to find OsUser or User in job ad.
03/25/26 09:38:20 ERROR "Failed to initialize user ids." at line 64 in file /var/lib/condor/execute/slot1/dir_3041422/scratch/userdir/build-b0FsGA/BUILD/condor-25.0.8/src/condor_utils/set_user_priv_from_ad.cpp
|