I have been working on this all day and I have made
some good progress using SSH tunnelling.
I have edited my condor_config to run only the
master and the schedd, and set the central manager variable to
127.0.0.1.
I then run the following commands:
ssh -fN -i ~/.ssh/clusterlogin -L 9618:thebeast:9618 condor@xxxxxxxxxxxxxx &
ssh -fN -i ~/.ssh/clusterlogin -L 9614:thebeast:9614 condor@xxxxxxxxxxxxxx &
These automatically log me into my Condor central
manager (firewalled) without asking for a password - which is so that I can
run them
in the background with the "&" - and then
forward port 9618 on the central manager machine to my local
machine. So I can now successfully execute condor_status and
get a listing of my pool, which is great.
I can also query the negotiator with condor_q
-ana and get an answer.
Now I try to submit a job, but it fails because
of my own requirement.
This job submits fine from the central manager
itself, so I'm thinking it has to do with my port forwarding setup.
condor_q -l 15.0 reports:
-- Submitter: tux.neuralgrid.org : <146.191.100.202:44953> : tux.neuralgrid.org
MyType = "Job"
TargetType = "Machine"
ClusterId = 15
QDate = 1129590058
CompletionDate = 0
Owner = "chris"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.6.8 Jan 27 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/home/chris/jobs/helloworld"
JobUniverse = 5
Cmd = "/home/chris/jobs/helloworld/helloworld"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
JobStatus = 1
EnteredCurrentStatus = 1129590058
JobPrio = 0
User = "chris@xxxxxxxxxxxxxxxxxx"
NiceUser = FALSE
Env = ""
JobNotification = 2
UserLog = "/home/chris/jobs/helloworld/log.out"
CoreSize = 0
KillSig = "SIGTERM"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "output_0.out"
Err = "error_0.out"
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "YES"
WhenToTransferOutput = "ON_EXIT"
TransferFiles = "ONEXIT"
ImageSize = 11
ExecutableSize = 11
DiskUsage = 11
Requirements = ((Arch == "X86_64") && (OpSys == "LINUX")) && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && (HasFileTransfer)
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
LeaveJobInQueue = FALSE
Args = ""
ProcId = 0
ServerTime = 1129591068

condor_status -l reports:
MyType = "Machine"
TargetType = "Job"
Name = "vm2@xxxxxxxxxxxxxxxxx"
Machine = "node9.cluster.int"
Rank = 0.000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "thebeast.cluster.int"
CondorVersion = "$CondorVersion: 6.7.10 Aug 3 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
VirtualMachineID = 2
VirtualMemory = 524408
Disk = 219531280
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 40969102
ConsoleIdle = 40969102
Memory = 2048
Cpus = 1
StartdIpAddr = "<192.168.1.109:33847>"
Arch = "X86_64"
OpSys = "LINUX"
UidDomain = "node9.cluster.int"
FileSystemDomain = "node9.cluster.int"
Subnet = "192.168.1"
HasIOProxy = TRUE
TotalVirtualMemory = 1048816
TotalDisk = 439062560
TotalCpus = 2
TotalMemory = 4096
KFlops = 595101
Mips = 2218
LastBenchmark = 1129584784
TotalLoadAvg = 0.000000
TotalCondorLoadAvg = 0.000000
ClockMin = 63
ClockDay = 2
TotalVirtualMachines = 2
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
HasPVM = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJICLocalConfig,HasJICLocalStdin,HasPVM,HasRemoteSyscalls,HasCheckpointing"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Unclaimed"
EnteredCurrentState = 1129080334
Activity = "Idle"
EnteredCurrentActivity = 1129584784
Start = TRUE
Requirements = START
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1129593698
MonitorSelfCPUUsage = 0.004167
MonitorSelfImageSize = 6896.000000
MonitorSelfResidentSetSize = 3256
MonitorSelfAge = 512393
DaemonStartTime = 1129080324
UpdateSequenceNumber = 1710
MyAddress = "<192.168.1.109:33847>"
LastHeardFrom = 1129590591
UpdatesTotal = 1711
UpdatesSequenced = 1710
UpdatesLost = 0
UpdatesHistory = "0x00000000000000000000000000000000"

Is there anything simple I am missing?
Chris
|