Mailing List Archives
Authenticated access
|
|
|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Condor-users] Jobs that won't condor_rm
- Date: Wed, 28 Dec 2005 14:38:51 -0600 (CST)
- From: Steven Timm <timm@xxxxxxxx>
- Subject: [Condor-users] Jobs that won't condor_rm
I have some jobs that users submitted to the globus universe
in the queue of my schedd right now, and I cannot, as root, remove
them with condor_rm.
When I execute the condor_rm command, they show status "X" for a few
seconds and then revert to status "H".
000 (26058.000.000) 12/21 09:11:17 Job submitted from host: <131.225.167.42:34784>
...
017 (26058.000.000) 12/21 09:11:26 Job submitted to Globus
RM-Contact: fngp-osg.fnal.gov/jobmanager-condor
JM-Contact: fngp-osg.fnal.gov/jobmanager-condor
Can-Restart-JM: 1
...
001 (26058.000.000) 12/21 09:14:00 Job executing on host: gt2 fngp-osg.fnal.gov/jobmanager-condor
...
012 (26058.000.000) 12/25 10:14:50 Job was held.
Globus error 22: the job manager failed to create an internal script argument file
Code 2 Subcode 22
...
012 (26058.000.000) 12/28 14:22:04 Job was held.
Globus error 7: authentication with the remote server failed
Code 2 Subcode 7
...
012 (26058.000.000) 12/28 14:35:13 Job was held.
Globus error 7: authentication with the remote server failed
Code 2 Subcode 7
...
-------------------------------------------------------------
I have made sure that there are no condor processes running
on the "remote server", which in this case is the same as the submit
machine. It appears that when condor_rm is run on a globus universe job,
Condor tries to contact the remote server to kill the job, but can't do
so because the proxy has obviously expired long ago.
Any idea how to get rid of such a job? Output of condor_q -long
is below for one of them.
Steve Timm
-- Submitter: fngp-osg.fnal.gov : <131.225.167.42:34784> : fngp-osg.fnal.gov
MyType = "Job"
TargetType = "Machine"
ClusterId = 26058
QDate = 1135177877
CompletionDate = 0
Owner = "yoo"
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.7.12 Sep 24 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/home/yoo/project/cdms/DarkPipe/DP_bin_10.16"
JobUniverse = 9
Cmd = "/home/yoo/project/cdms/DarkPipe/DP_bin_10.16/runpipeclean.sh"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
RemoteSpoolDir = "/local/stage1/condor/spool/cluster26058.proc0.subproc0"
x509userproxysubject = "/DC=org/DC=doegrids/OU=People/CN=Jonghee Yoo 223786"
x509userproxy = "/tmp/x509up_u11998"
JobPrio = 0
User = "yoo@xxxxxxxx"
NiceUser = FALSE
Env = ""
JobNotification = 0
WantRemoteIO = TRUE
UserLog = "/home/yoo/project/cdms/DarkPipe/DP_bin_10.16/grid_cdms/grid_cdms.log.26058.0"
CoreSize = 0
KillSig = "SIGTERM"
Rank = (Mips)
In = "/dev/null"
TransferIn = FALSE
Out = "grid_cdms/grid_cdms.out.26058.0"
StreamOut = TRUE
Err = "grid_cdms/grid_cdms.err.26058.0"
StreamErr = TRUE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
TransferFiles = "NEVER"
ImageSize = 1
ExecutableSize = 1
DiskUsage = 1
Requirements = TRUE
FileSystemDomain = "fnal.gov"
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
OnExitHold = FALSE
OnExitRemove = TRUE
LeaveJobInQueue = FALSE
Args = "140726_1601"
GridResource = "gt2 fngp-osg.fnal.gov/jobmanager-condor"
GlobusResubmit = FALSE
WantClaiming = FALSE
GlobusRSL = "(jobtype=single)(maxwalltime=999)"
GlobalJobId = "fngp-osg.fnal.gov#1135177877#26058.0"
ProcId = 0
GlobusGramVersion = 3
NumGlobusSubmits = 1
GridJobId = "gt2 fngp-osg.fnal.gov/jobmanager-condor https://fngp-osg.fnal.gov:49645/23112/1135177881/"
GlobusStatus = 0
RemoteWallClockTime = 349250.000000
WallClockCheckpoint = UNDEFINED
ShadowBday = 0
RemoveReason = "via condor_rm (by user root)"
JobStatusOnRelease = 3
JobStatus = 5
EnteredCurrentStatus = 1135802113
HoldReason = "Globus error 7: authentication with the remote server failed"
HoldReasonCode = 2
HoldReasonSubCode = 7
ReleaseReason = UNDEFINED
NumSystemHolds = 3
Managed = "Schedd"
ServerTime = 1135802274
--
------------------------------------------------------------------
Steven C. Timm, Ph.D (630) 840-8525 timm@xxxxxxxx http://home.fnal.gov/~timm/
Fermilab Computing Div/Core Support Services Dept./Scientific Computing Section
Assistant Group Leader, Farms and Clustered Systems Group
Lead of Computing Farms Team