Brian On Apr 7, 2015, at 9:42 AM, Sridhar Thumma <deadman.den@xxxxxxxxx> wrote: Hi, Please see my comments inline: On Tue, Apr 7, 2015 at 7:55 PM, Brian Bockelman <bbockelm@xxxxxxxxxxx> wrote: Hi Sridhar, The configuration seems reasonable. However, we'd need more context to know if it's working as expected. 1) Did you run condor_reconfig after changing the configuration? I restarted condor using condor_restart. This should refresh config values, right? Yup, that should be fine. 2) Can you give an example classad of a job you think should be released under this policy? I submitted a grid job where the AMI ID is not valid. If the AMI ID is not valid, the job will go into the held state. In this case, it should retry the configured number of times. Make sense? I actually want to use SYSTEM_PERIODIC_RELEASE to release jobs which are going into the held state because of a service unavailable error from Amazon. I am using the above test to validate my configuration, as it is not possible to test the service unavailable error condition now. Yes - I understood this part. However, to understand why it's not doing what you think it should, we'd need to actually see the classad.
_______________________________________________
HTCondor-users mailing list
To unsubscribe, send a message to htcondor-users-request@xxxxxxxxxxx with a
subject: Unsubscribe
You can also unsubscribe by visiting
https://lists.cs.wisc.edu/mailman/listinfo/htcondor-users
The archives can be found at:
https://lists.cs.wisc.edu/archive/htcondor-users/
-- Submitter: cgw_dev@xxxxxxxxxxxxxxxxxxxxxx : <172.16.130.74:35003> : iad-dev-htc-a1.pdx.aws MaxHosts = 1 Managed = "Schedd" User = "cgw_dev@xxxxxxxxxxxxxxxxxxxxxx" OnExitHold = false CoreSize = 0 LastRemoteStatusUpdate = 1428418906 WantRemoteSyscalls = false MyType = "Job" Rank = 500 - TotalLoadAvg CumulativeSuspensionTime = 0 ReleaseReason = undefined MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/gps-ccgs-blat-test-1/logs/gene_names.err" Submission = "cgw_dev@xxxxxxxxxxxxxxxxxxxxxx#2868" ProcId = 0 EnteredCurrentStatus = 1428418912 UserLog = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/gps-ccgs-blat-test-1/logs/gene_names.log" HoldReasonSubCode = 0 NumJobStarts = 0 JobUniverse = 9 In = "/dev/null" Requirements = true EC2VpcSubnet = "subnet-503cb427" ClusterId = 2868 WhenToTransferOutput = "ON_EXIT" CompletionDate = 0 EC2AmiID = "ami-c4f2d0ac" BufferSize = 524288 Environment = 
"LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj=01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01;31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36: _=/usr/bin/condor_submit QTINC=/usr/lib64/qt-3.3/include CVS_RSH=ssh QTLIB=/usr/lib64/qt-3.3/lib HISTCONTROL=ignoredups PWD=/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/logs SHLVL=1 LANG=en_US.UTF-8 TERM=cygwin MAIL=/var/spool/mail/cgw_dev OLDPWD=/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436 LESSOPEN=||/usr/bin/lesspipe.sh' '%s SSH_ASKPASS=/usr/libexec/openssh/gnome-ssh-askpass G_BROKEN_FILENAMES=1 QTDIR=/usr/lib64/qt-3.3 SHELL=/bin/bash USER=cgw_dev PATH=/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/cgw_dev/bin 
HISTSIZE=1000 LOGNAME=cgw_dev HOSTNAME=iad-dev-htc-a1.pdx.aws HOME=/home/cgw_dev" EC2TagName = "IAD-DEV-GET_GENE_NAMES_SCRRIPT" TargetType = "Machine" LeaveJobInQueue = false JobNotification = 1 Owner = "cgw_dev" CondorPlatform = "$CondorPlatform: X86_64-ScientificLinux_6.6 $" CommittedTime = 0 QDate = 1428418903 TransferIn = false ExitStatus = 0 NumCkpts_RAW = 0 HoldReason = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<Response><Errors><Error><Code>InvalidAMIID.NotFound</Code><Message>The image id 'ami-c4f2d0ac' does not exist</Message></Error></Errors><RequestID>eec8cbed-966f-4c4e-b2a6-dbf95310a56f</RequestID></Response>" RootDir = "/" CurrentHosts = 0 GlobalJobId = "cgw_dev@xxxxxxxxxxxxxxxxxxxxxx#2868.0#1428418903" RemoteSysCpu = 0.0 TotalSuspensions = 0 WantCheckpoint = false TransferExecutable = false PeriodicRelease = false CondorVersion = "$CondorVersion: 7.8.10 Jan 19 2015 BuildID: RH-7.8.10-0.2.el6 $" Out = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/gps-ccgs-blat-test-1/logs/gene_names.out" ShouldTransferFiles = "IF_NEEDED" DiskUsage = 0 CumulativeSlotTime = 0 EC2SecurityGroups = "sg-433ce527" CommittedSlotTime = 0 LocalUserCpu = 0.0 NotifyUser = "sridhar.thumma@xxxxxxxxxxxxxxxxxxxxxx" DiskUsage_RAW = 0 ExitBySignal = false StreamErr = false HoldReasonCode = 0 NumSystemHolds = 1 NumRestarts = 0 RequestDisk = DiskUsage GridJobId = "ec2 http://ec2.us-east-1.amazonaws.com 7d7576f3-b90e-4e00-a535-90f4d91f6ad0" FileSystemDomain = "iad-dev-htc-a1.pdx.aws" JobPrio = 5 EC2UserDataFile = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/logs/gene_names_job_primary_userdata.sh" NumCkpts = 0 BufferBlockSize = 32768 ImageSize = 0 CommittedSuspensionTime = 0 ExecutableSize_RAW = 0 Cmd = "IAD-DEV-GET_GENE_NAMES_SCRRIPT" WantClaiming = false LocalSysCpu = 0.0 Iwd = 
"/mnt/gfs/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/logs" GridResource = "ec2 http://ec2.us-east-1.amazonaws.com" ServerTime = 1428419214 EC2InstanceType = "m3.medium" ImageSize_RAW = 0 LastSuspensionTime = 0 JobStatus = 5 ExecutableSize = 0 RemoteWallClockTime = 0.0 OnExitRemove = true Arguments = "" KillSig = "SIGTERM" StreamOut = false CurrentTime = time() RequestMemory = ifthenelse(MemoryUsage =!= undefined,MemoryUsage,( ImageSize + 1023 ) / 1024) RemoteUserCpu = 0.0 NiceUser = false RequestCpus = 1 EC2TagNames = "Name" WantRemoteIO = true LastJobStatus = 1