HTCondor Project List Archives



[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-devel] Patches to src/condor_shadow.V6/pseudo_ops.C [2/3]



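The second patch (applied after the first) changes the way retry_wait
is calculated.  Instead of doubling on every iteration, retry_wait now
starts at 20 and is incremented by a step that itself grows by 20
seconds each time (so the increments are 40, 60, 80, ...).  The patch
also lowers the MaxRetryWait constant from 3600 to 1200.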

The old sequence was:

 5, 10,  20,  40,  80, 160, 320, 640, 1280, 2560 (totaling 5115)

The new sequence is:

20, 60, 120, 200, 300, 420, 560, 720,  900, 1100 (totaling 4400)

Experience has shown that the initial retries are too close together
for a busy checkpoint server to have done any work, while the final
retries are spaced so far apart that the checkpoint server ends up
being mostly idle.  In my opinion, the new sequence of retries
will provide better results.

--- src/condor_shadow.V6/pseudo_ops.C.SAVE	Thu Dec 16 14:47:59 2004
+++ src/condor_shadow.V6/pseudo_ops.C.NEW	Fri Dec 17 14:39:02 2004
@@ -125,7 +125,7 @@
 // count of network bytes send and received outside of CEDAR RSC socket
 extern float BytesSent, BytesRecvd;
 
-const int MaxRetryWait = 3600;
+const int MaxRetryWait = 1200;
 static bool CkptWanted = true;	// WantCheckpoint from Job ClassAd
 static bool RestoreCkptWithNoScheddName = false; // compat with old naming
 
@@ -668,7 +668,7 @@
 	pid_t	child_pid;
 	int		rval;
 	PROC *p = (PROC *)Proc;
-	int		retry_wait;
+	int		retry_wait, retry_step;
 	bool	CkptFile = is_ckpt_file(file);
 	bool	ICkptFile = is_ickpt_file(file);
 	priv_state	priv;
@@ -698,7 +698,7 @@
 			// If LastCkptServer is not NULL, we stored our last checkpoint
 			// file on the checkpoint server.
 		SetCkptServerHost(LastCkptServer);
-		retry_wait = 5;
+		retry_wait = retry_step = 20;
 		do {
 			rval = RequestRestore(p->owner,
 								  (RestoreCkptWithNoScheddName) ? NULL :
@@ -712,7 +712,7 @@
 				dprintf(D_ALWAYS, "ckpt server restore failed, trying again"
 						" in %d seconds\n", retry_wait);
 				sleep(retry_wait);
-				retry_wait *= 2;
+				retry_wait += retry_step += 20;
 			}
 		} while (rval);
 
@@ -845,7 +845,7 @@
 	PROC *p = (PROC *)Proc;
 	bool	CkptFile = is_ckpt_file(file);
 	bool	ICkptFile = is_ickpt_file(file);
-	int		retry_wait = 5;
+	int		retry_wait = 20, retry_step = 20;
 	priv_state	priv;
 	mode_t	omask;
 
@@ -886,7 +886,7 @@
 				dprintf(D_ALWAYS, "store request to ckpt server failed, "
 						"trying again in %d seconds\n", retry_wait);
 				sleep(retry_wait);
-				retry_wait *= 2;
+				retry_wait += retry_step += 20;
 			}
 		} while (rval);
 
@@ -2175,7 +2175,7 @@
 int
 has_ckpt_file()
 {
-	int		rval, retry_wait = 5;
+	int		rval, retry_wait = 20, retry_step = 20;
 	PROC *p = (PROC *)Proc;
 	priv_state	priv;
 	long	accum_usage;
@@ -2201,7 +2201,7 @@
 			dprintf(D_ALWAYS, "failed to contact ckpt server, trying again"
 					" in %d seconds\n", retry_wait);
 			sleep(retry_wait);
-			retry_wait *= 2;
+			retry_wait += retry_step += 20;
 		}
 	} while(rval == -1 && LastCkptServer && accum_usage > MaxDiscardedRunTime);
 	if (rval == -1) { /* not on local disk & not using ckpt server */

-- 
Daniel K. Forrest	Laboratory for Molecular and
forrest@xxxxxxxxxxxxx	Computational Genomics
(608) 262 - 9479	University of Wisconsin, Madison