Mailing List Archives
Authenticated access
|
|
|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Condor-users] Why does job submitted by virtual machine(running Xen 3.0) not run?
- Date: Thu, 1 Jun 2006 19:20:39 +0800
- From: "Yufang Zhang" <zhangyufang@xxxxxxxxxx>
- Subject: [Condor-users] Why does job submitted by virtual machine(running Xen 3.0) not run?
Hi,
I have a Condor pool running under Linux. There is one
virtual machine (running Xen 3.0) and several real machines in the Condor pool.
All the jobs submitted from the real machines run quite well in the pool. But
the jobs submitted from the virtual machine (running Xen 3.0) always stay
'idle', even though all the machines in the pool are in the 'Unclaimed' state. Can
anyone tell me why, and how to force the jobs to run?
Thank you in advance for your help.
The operating system of the virtual machine is
CentOS 4.2, and the operating system of all the real machines is
CentOS 4.3. I am using Condor 6.7.19 for Fedora Core 4
in the Condor pool.
The state of the Condor pool is as follows (gcnode034.cap is the virtual machine; gcnode022.cap, gcnode026.cap and gcnode038.cap are
real machines):
Name           OpSys  Arch   State      Activity  LoadAv  Mem   ActvtyTime
vm1@gcnode022  LINUX  INTEL  Unclaimed  Idle      0.000   1014  0+01:32:10
vm2@gcnode022  LINUX  INTEL  Unclaimed  Idle      0.000   1014  0+03:42:42
vm3@gcnode022  LINUX  INTEL  Unclaimed  Idle      0.000   1014  0+03:43:46
vm4@gcnode022  LINUX  INTEL  Unclaimed  Idle      0.000   1014  0+03:43:42
vm1@gcnode026  LINUX  INTEL  Unclaimed  Idle      0.000   503   0+03:18:47
vm2@gcnode026  LINUX  INTEL  Unclaimed  Idle      0.000   503   0+01:05:05
gcnode034.cap  LINUX  INTEL  Unclaimed  Idle      0.000   800   0+03:21:40
gcnode038.cap  LINUX  INTEL  Unclaimed  Idle      0.000   800   0+02:56:40

             Total  Owner  Claimed  Unclaimed  Matched  Preempting  Backfill
INTEL/LINUX  8      0      0        8          0        0           0
Total        8      0      0        8          0        0           0
I have edited the local config file (condor_config) of every machine in the pool and added the following
lines:
START = True
SUSPEND = False
CONTINUE = True
PREEMPT = False
KILL = False
But the jobs still stay 'idle'.
When I run the condor_q -l command, I see the following:
-- Submitter: gcnode034.cap : <192.168.10.34:47204> : gcnode034.cap
MyType = "Job"
TargetType = "Machine"
ClusterId = 12
QDate = 1149147167
CompletionDate = 0
Owner = "condor"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.7.19 May 10 2006 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/home/condor"
JobUniverse = 1
Cmd = "/home/condor/fortIO.remote"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = TRUE
WantCheckpoint = TRUE
JobStatus = 1
EnteredCurrentStatus = 1149147167
JobPrio = 0
User = "condor@xxxxxxxxxxxxx"
NiceUser = FALSE
MaxJobRetirementTime = 0
Environment = ""
JobNotification = 2
WantRemoteIO = TRUE
UserLog = "/home/condor/fortIO.log"
CoreSize = 0
KillSig = "SIGTSTP"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "fortIO.out"
StreamOut = FALSE
Err = "fortIO.err"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
TransferFiles = "NEVER"
ImageSize_RAW = 13356
ImageSize = 20000
ExecutableSize_RAW = 13356
ExecutableSize = 20000
DiskUsage_RAW = 13356
DiskUsage = 20000
Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && ((CkptArch == Arch) || (CkptArch =?= UNDEFINED)) && ((CkptOpSys == OpSys) || (CkptOpSys =?= UNDEFINED)) && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize)
FileSystemDomain = "gcnode034.cap"
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
>>LeaveJobInQueue = FALSE
Arguments = ""
GlobalJobId = "gcnode034.cap#1149147167#12.0"
ProcId = 0
AutoClusterId = 0
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,DiskUsage,ImageSize,Requirements"
ServerTime = 1149159854
MyType = "Job"
TargetType = "Machine"
ClusterId = 13
QDate = 1149147167
CompletionDate = 0
Owner = "condor"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.7.19 May 10 2006 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/home/condor"
JobUniverse = 5
Cmd = "/home/condor/sh_loop"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
JobStatus = 1
EnteredCurrentStatus = 1149147167
JobPrio = 0
User = "condor@xxxxxxxxxxxxx"
NiceUser = FALSE
Environment = ""
JobNotification = 2
WantRemoteIO = TRUE
UserLog = "/home/condor/sh_loop.log"
CoreSize = 0
KillSig = "SIGTERM"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "sh_loop.out"
StreamOut = FALSE
Err = "sh_loop.err"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "IF_NEEDED"
WhenToTransferOutput = "ON_EXIT"
TransferFiles = "ONEXIT"
ImageSize_RAW = 1
ImageSize = 10000
ExecutableSize_RAW = 1
ExecutableSize = 10000
DiskUsage_RAW = 1
DiskUsage = 10000
Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && ((HasFileTransfer) || (TARGET.FileSystemDomain == MY.FileSystemDomain))
FileSystemDomain = "gcnode034.cap"
JobLeaseDuration = 1200
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
>>LeaveJobInQueue = FALSE
Args = "60"
GlobalJobId = "gcnode034.cap#1149147167#13.0"
ProcId = 0
AutoClusterId = 1
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,DiskUsage,ImageSize,FileSystemDomain,Requirements"
ServerTime = 1149159854
Best wishes!
_________
Yufang Zhang
2006-06-01