Re: [HTCondor-users] Jobs are reporting cgroups kills at strange limits
- Date: Thu, 19 Feb 2026 13:20:09 +0100 (CET)
- From: "Beyer, Christoph" <christoph.beyer@xxxxxxx>
- Subject: Re: [HTCondor-users] Jobs are reporting cgroups kills at strange limits
Hi,
there were some problems with correct memory reporting in Condor on early EL9 versions, but those have long been fixed.
Do you have a SYSTEM_PERIODIC_HOLD expression configured, and if so, could you post it?
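For illustration, a system periodic hold of this kind usually looks something like the following; the 1.1 factor and the exact expression are only a sketch of the idea, not anyone's actual configuration:

  # illustrative only: hold a running job once measured usage exceeds RequestMemory plus 10%
  SYSTEM_PERIODIC_HOLD = (JobStatus == 2) && (MemoryUsage > 1.1 * RequestMemory)
  SYSTEM_PERIODIC_HOLD_REASON = "memory usage exceeded RequestMemory plus the 10% margin"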
Best
christoph
--
Christoph Beyer
DESY Hamburg
IT-Department
Notkestr. 85
Building 02b, Room 009
22607 Hamburg
phone:+49-(0)40-8998-2317
mail: christoph.beyer@xxxxxxx
----- Original Message -----
From: svatosm@xxxxxx
To: "htcondor-users" <htcondor-users@xxxxxxxxxxx>
CC: vs-admin@xxxxxx
Sent: Thursday, 19 February 2026 13:12:33
Subject: [HTCondor-users] Jobs are reporting cgroups kills at strange limits
Hi,
we would like to ask for advice with a problem we see in our HTCondor
installation in Prague. We set up cgroups to watch job memory limits and
kill jobs when they go over the requested amount plus a 10% extra margin. But
when we look at the information about killed jobs in the history
files, we get strange messages in LastHoldReason like this:
Error from slot1_10@xxxxxxxxxxxxxxxxxxxxxxxx: Job has gone over cgroup
memory limit of 16000 megabytes. Last measured usage: 1085 megabytes.
Consider resubmitting with a higher request_memory.
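A minimal sketch of the kind of setup and check described above, assuming only the standard cgroup enforcement knob and the standard history query tool (the site-specific mechanism for the 10% margin is not shown):

  # execute-node config: enforce the job's memory limit as a hard cgroup limit
  CGROUP_MEMORY_LIMIT_POLICY = hard

  # pull the relevant attributes for the job below out of the history file
  condor_history 4218714.0 -af RequestMemory MemoryProvisioned JobMemoryLimit LastHoldReason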
All the data from the history file for that job is below. There are a
couple of odd things in it. First, RequestMemory is 1000 and the
last measured usage is still below 110% of the request. Second, the origin
of the 16000 MB limit in the message is unclear. Would anyone know an
explanation for that?
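Spelling the numbers out, using the values from the ClassAd below:

  RequestMemory            = 1000 MB
  1.10 * RequestMemory     = 1100 MB   (expected cut-off with the 10% margin)
  last measured usage      = 1085 MB   (below that cut-off)
  limit quoted in the hold = 16000 MB  (the same value as MemoryProvisioned in the ad)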
thanks
Michal Svatos
AccountingGroup = "group_atlas.prod.atlasprd001"
AcctGroup = "group_atlas.prod"
AcctGroupUser = "atlasprd001"
ActivationDuration = 15628
ActivationSetupDuration = 2
ActivationTeardownDuration = 1769446065
Arguments = ""
BlockReadKbytes = 0
BlockReads = 0
BlockWriteKbytes = 0
BlockWrites = 0
BytesRecvd = 107197.0
BytesSent = 0.0
ClusterId = 4218714
Cmd = "/var/spool/arc/session/2bcb04c60a3e/condorjob.sh"
CommittedSlotTime = 0
CommittedSuspensionTime = 0
CommittedTime = 0
CondorPlatform = "$CondorPlatform: x86_64_AlmaLinux9 $"
CondorVersion = "$CondorVersion: 24.0.15 2025-12-12 BuildID: 856728
PackageID: 24.0.15-1 GitSHA: b96c5ce3 $"
CpusProvisioned = 8
CumulativeRemoteSysCpu = 832.0
CumulativeRemoteUserCpu = 15572.0
CumulativeSlotTime = 125040.0
CumulativeSuspensionTime = 0
CurrentHosts = 0
DiskProvisioned = 167779148
DiskUsage = 225000
DiskUsage_RAW = 216908
EnteredCurrentStatus = 1770035531
Environment = ""
Err = "/var/spool/arc/session/2bcb04c60a3e.comment"
ExecutableSize = 22
ExecutableSize_RAW = 21
ExecuteDirWasEncrypted = false
ExitBySignal = false
ExitStatus = 0
GPUsProvisioned = 0
GlobalJobId = "arc1.farm.particle.cz#4218714.0#1769417681"
ImageSize = 1250000
ImageSize_RAW = 1111608
In = "/dev/null"
Iwd = "/var/spool/arc/session/2bcb04c60a3e"
JobCpuLimit = 345600
JobCurrentFinishTransferInputDate = 1769430438
JobCurrentReconnectAttempt = undefined
JobCurrentStartDate = 1769430436
JobCurrentStartExecutingDate = 1769430439
JobCurrentStartTransferInputDate = 1769430438
JobDescription = "gridjob"
JobFinishedHookDone = 1770035532
JobLeaseDuration = 2400
JobMemoryLimit = 1024000
JobNotification = 0
JobPrio = 0
JobRunCount = 1
JobStartDate = 1769430436
JobStatus = 3
JobSubmitFile = "/var/spool/arc/session/2bcb04c60a3e/condorjob.jdl"
JobSubmitMethod = 0
JobTimeLimit = 345600
JobUniverse = 5
LastHoldReason = "Error from slot1_10@xxxxxxxxxxxxxxxxxxxxxxxx: Job has
gone over cgroup memory limit of 16000 megabytes. Last measured usage:
1085 megabytes. Consider resubmitting with a higher request_memory."
LastHoldReasonCode = 34
LastHoldReasonSubCode = 102
LastJobLeaseRenewal = 1769446065
LastJobStatus = 5
LastMatchTime = 1769430436
LastPublicClaimId =
"<172.16.17.4:9618?addrs=[2001-718-401-6017-20-0-17-4]-9618+172.16.17.4-9618&alias=turin04.farm.particle.cz&noUDP&sock=startd_4127_5b71>#1769425273#107#..."
LastRejMatchReason = "no match found "
LastRejMatchTime = 1769430402
LastRemoteHost = "slot1_10@xxxxxxxxxxxxxxxxxxxxxxxx"
LastRemoteWallClockTime = 15630.0
LastSuspensionTime = 0
LastVacateTime = 1769446065
LeaveJobInQueue = false
MATCH_EXP_MachineScalingFactorFZU = "2.174242424242424E+00"
MATCH_EXP_MachineScalingFactorHEPSPEC06 = "2.296000000000000E+01"
MATCH_EXP_MachineScalingSlotWeight = "8"
MachineAttrCpus0 = 8
MachineAttrScalingFactorFZU0 = 2.174242424242424
MachineAttrScalingFactorHEPSPEC060 = 22.96
MachineAttrSlotWeight0 = 8
MachineScalingFactorFZU = "$$([ifThenElse(isUndefined(ScalingFactorFZU),
1.00, ScalingFactorFZU)])"
MachineScalingFactorHEPSPEC06 =
"$$([ifThenElse(isUndefined(ScalingFactorHEPSPEC06), 10.56,
ScalingFactorHEPSPEC06)])"
MachineScalingSlotWeight = "$$([ifThenElse(isUndefined(SlotWeight),
0.00, SlotWeight)])"
MaxHosts = 1
MemoryProvisioned = 16000
MemoryUsage = ((ResidentSetSize + 1023) / 1024)
MinHosts = 1
MyType = "Job"
NordugridQueue = "grid"
NumCkpts = 0
NumCkpts_RAW = 0
NumHolds = 1
NumHoldsByReason = [ JobOutOfResources = 1 ]
NumJobCompletions = 0
NumJobMatches = 1
NumJobStarts = 1
NumRestarts = 0
NumShadowStarts = 1
NumSystemHolds = 0
OrigMaxHosts = 1
Out = "/var/spool/arc/session/2bcb04c60a3e.comment"
Owner = "atlasprd001"
PeriodicRemove = (JobStatus == 1 && NumJobStarts > 0) || RemoteUserCpu +
RemoteSysCpu > JobCpuLimit || RemoteWallClockTime > JobTimeLimit ||
(JobStatus == 1 && NumJobStarts > 0)
ProcId = 0
QDate = 1769417681
Rank = 0.0
RecentBlockReadKbytes = 0
RecentBlockReads = 0
RecentBlockWriteKbytes = 0
RecentBlockWrites = 0
RecentStatsLifetimeStarter = 1200
RemoteSysCpu = 832.0
RemoteUserCpu = 15572.0
RemoteWallClockTime = 15630.0
RemoveReason = "via condor_rm (by user atlasprd001)"
RequestCpus = 1
RequestDisk = 20971520 * RequestCpus
RequestMemory = 1000
Requirements = ((NumJobStarts == 0) && (((Arch == "X86_64") && (OpSys
=?= "LINUX") && ((OpSysName =?= "CentOS") || (OpSysName =?=
"AlmaLinux")) && (OpSysMajorVer =?= 9)))) && (TARGET.Disk >=
RequestDisk) && (TARGET.Memory >= RequestMemory) &&
(TARGET.HasFileTransfer) && (NumJobStarts == 0)
ResidentSetSize = 1000000
ResidentSetSize_RAW = 997532
ScratchDirFileCount = 2727
ShouldTransferFiles = "YES"
StartdPrincipal = "execute-side@matchsession/2001:718:401:6017:20:0:17:4"
StatsLifetimeStarter = 15626
StreamErr = false
StreamOut = false
TargetType = "Machine"
TotalSubmitProcs = 1
TotalSuspensions = 0
TransferIn = false
TransferInFinished = 1769430438
TransferInStarted = 1769430438
TransferInput = "/var/spool/arc/session/2bcb04c60a3e"
TransferInputSizeMB = 0
TransferInputStats = [ CedarFilesCountTotal = 9; CedarFilesCountLastRun
= 9 ]
TransferOutputStats = [ ]
User = "atlasprd001@xxxxxxxxxxxxxxxx"
UserLog = "/var/spool/arc/session/2bcb04c60a3e/log"
VacateReason = "Error from slot1_10@xxxxxxxxxxxxxxxxxxxxxxxx: Job has
gone over cgroup memory limit of 16000 megabytes. Last measured usage:
1085 megabytes. Consider resubmitting with a higher request_memory."
VacateReasonCode = 34
VacateReasonSubCode = 102
WhenToTransferOutput = "ON_EXIT_OR_EVICT"
use_x509userproxy = true
x509UserProxyEmail = "atlas.pilot1@xxxxxxx"
x509UserProxyExpiration = 1769760905
x509UserProxyFQAN = "/DC=ch/DC=cern/OU=Organic
Units/OU=Users/CN=atlpilo1/CN=614260/CN=Robot: ATLAS
Pilot1,/atlas/Role=production/Capability=NULL,/atlas/Role=NULL/Capability=NULL,/atlas/usatlas/Role=NULL/Capability=NULL"
x509UserProxyFirstFQAN = "/atlas/Role=production/Capability=NULL"
x509UserProxyVOName = "atlas"
x509userproxy = "/var/spool/arc/session/2bcb04c60a3e/user.proxy"
x509userproxysubject = "/DC=ch/DC=cern/OU=Organic
Units/OU=Users/CN=atlpilo1/CN=614260/CN=Robot: ATLAS Pilot1"
*** Offset = 29744629 ClusterId = 4218714 ProcId = 0 Owner =
"atlasprd001" CompletionDate = -1
_______________________________________________
HTCondor-users mailing list
To unsubscribe, send a message to htcondor-users-request@xxxxxxxxxxx with a
subject: Unsubscribe
The archives can be found at: https://www-auth.cs.wisc.edu/lists/htcondor-users/