HTCondor Project List Archives



[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-devel] Patches to src/condor_shadow.V6/pseudo_ops.C [3/3]



The final patch adds a test to see that the remote job is still alive
while we wait for the checkpoint server.  Without this test, if we are
evicted forcibly while checkpointing, the shadow will wait pointlessly
when it could simply exit and make the job available to run on another
host.

The method is simply to try to read from the remote job.  If it is
still alive, the read will time out; otherwise the connection is gone.

This does lead to many "condor_read(): timeout reading buffer." lines
in the ShadowLog, but it is much better than the current behavior.

--- src/condor_shadow.V6/pseudo_ops.C.NEW	Fri Dec 17 14:39:02 2004
+++ src/condor_shadow.V6/pseudo_ops.C	Wed Jun  8 13:37:22 2005
@@ -652,6 +652,29 @@
 	return -1;					// should never get here
 }
 
+bool
+lost_connection(unsigned int seconds)
+{
+	time_t now, then = time(0) + seconds;
+
+	while ((now = time(0)) < then) {
+		int timeout, peek;
+		char c = 0;
+		unsigned int nap;
+
+		timeout = syscall_sock->timeout(1);
+		peek = syscall_sock->peek(c);
+		syscall_sock->timeout(timeout);
+		if (peek) {
+			dprintf(D_ALWAYS, "peer went away (%d %d %d)\n", peek, c, timeout);
+			return true;
+		}
+		if ((nap = then - now) > 60) nap = 60;
+		sleep(nap);
+	}
+	return false;
+}
+
 /*
   Provide a process which will serve up the requested file as a
   stream.  The ip_addr and port number passed back are in host
@@ -711,7 +734,9 @@
 				}
 				dprintf(D_ALWAYS, "ckpt server restore failed, trying again"
 						" in %d seconds\n", retry_wait);
-				sleep(retry_wait);
+				if (lost_connection(retry_wait)) {
+					EXCEPT("ckpt server restore failed");
+				}
 				retry_wait += retry_step += 20;
 			}
 		} while (rval);
@@ -885,7 +910,9 @@
 				}
 				dprintf(D_ALWAYS, "store request to ckpt server failed, "
 						"trying again in %d seconds\n", retry_wait);
-				sleep(retry_wait);
+				if (lost_connection(retry_wait)) {
+					EXCEPT("ckpt server store failed");
+				}
 				retry_wait += retry_step += 20;
 			}
 		} while (rval);
@@ -2200,7 +2227,9 @@
 			}
 			dprintf(D_ALWAYS, "failed to contact ckpt server, trying again"
 					" in %d seconds\n", retry_wait);
-			sleep(retry_wait);
+			if (lost_connection(retry_wait)) {
+				EXCEPT("failed to contact ckpt server");
+			}
 			retry_wait += retry_step += 20;
 		}
 	} while(rval == -1 && LastCkptServer && accum_usage > MaxDiscardedRunTime);

-- 
Daniel K. Forrest	Laboratory for Molecular and
forrest@xxxxxxxxxxxxx	Computational Genomics
(608) 262 - 9479	University of Wisconsin, Madison