#!/usr/bin/python3

import htcondor
import os
import sys
from time import time

def error(msg):
    """Report a fatal error and terminate the script.

    Writes ``Error: <msg>`` followed by a newline to standard error, then
    raises ``SystemExit`` with exit status 1. Never returns.
    """
    sys.stderr.write(f"Error: {msg}\n")
    raise SystemExit(1)

def _split_dagman_log(user_log):
    """Derive the DAG's directory and file name from its DAGMan job log path.

    The DAG file name is recovered by stripping the ".dagman.log" suffix from
    the log's file name.

    Returns a ``(dir_path, dag_file)`` tuple, or ``None`` when the log name
    does not end in ".dagman.log" (or consists of nothing but that suffix),
    in which case the DAG file name cannot be recovered. ``dir_path`` is an
    empty string when ``user_log`` has no directory component.
    """
    suffix = ".dagman.log"
    dir_path, log_name = os.path.split(user_log)
    if not log_name.endswith(suffix) or len(log_name) == len(suffix):
        return None
    return dir_path, log_name[:-len(suffix)]


def _await_exit_event(log_path, cluster_id, timeout):
    """Wait for the event showing that job ``cluster_id`` left the queue.

    Reads the job event log at ``log_path`` and returns the first
    JOB_TERMINATED or JOB_ABORTED event belonging to ``cluster_id``, or
    ``None`` if no such event shows up within ``timeout`` seconds.
    """
    deadline = time() + timeout
    jel = htcondor.JobEventLog(log_path)
    try:
        while time() < deadline:
            # Poll with a finite stop_after: with stop_after=None the iterator
            # waits indefinitely for the next event, so the deadline would
            # never be re-checked if the exit event is never written.
            for event in jel.events(stop_after=1):
                if event.cluster != cluster_id:
                    continue
                if event.type in (htcondor.JobEventType.JOB_TERMINATED,
                                  htcondor.JobEventType.JOB_ABORTED):
                    return event
    finally:
        jel.close()
    return None


def main():
    """Restart a DAG that is currently in the queue.

    Expects exactly one command-line argument: the Cluster ID of the DAGMan
    job. The job is looked up in the local Schedd, removed, and, once its
    event log confirms it has left the queue, the DAG is resubmitted from the
    directory holding its DAGMan job log.

    Exits with status 0 without restarting when the job is not in the queue,
    is not a DAGMan job, or finished normally before the removal took effect.
    Exits with status 1 (via ``error``) on invalid arguments, an unexpected
    job ad, a timeout waiting for the removal, or a failed resubmission.
    """
    # Process passed arguments
    prog_name = sys.argv[0]
    if len(sys.argv) == 1:
        error(f"{prog_name} expects a DAGMan Cluster ID")
    if len(sys.argv) > 2:
        error(f"{prog_name} was given too many arguments. Only expects one Cluster ID")
    try:
        cluster_id = int(sys.argv[1])
    except ValueError as e:
        error(f"invalid Cluster ID | {e}")

    # Query the local Schedd for job information
    schedd = htcondor.Schedd()
    ads = schedd.query(constraint=f"ClusterId=={cluster_id}", projection=["UserLog", "DAG_NodesFailed"])
    if len(ads) == 0:
        print(f"Job {cluster_id} is not in the queue")
        sys.exit(0)
    if len(ads) > 1:
        error(f"Schedd returned {len(ads)} job ads. Only expected one")
    dag_ad = ads[0]
    # The DAG_NodesFailed attribute is used as the marker of a DAGMan job
    if "DAG_NodesFailed" not in dag_ad:
        print(f"Job {cluster_id} is not a DAGMan job")
        sys.exit(0)
    if "UserLog" not in dag_ad:
        error("Invalid job ad returned. Missing UserLog")
    user_log = dag_ad["UserLog"]

    # Work out where and what to resubmit BEFORE removing anything, so an
    # unusable UserLog cannot leave the DAG removed but not restarted.
    location = _split_dagman_log(user_log)
    if location is None:
        error(f"Unable to determine the DAG file from UserLog {user_log}")
    dir_path, dag_file = location

    # Remove the parent DAGMan job
    print(f"Removing DAG ({cluster_id})")
    schedd.act(htcondor.JobAction.Remove,
               job_spec=f"ClusterId=={cluster_id}",
               reason=f"Restarting DAGMan via {prog_name}")

    # Wait for the DAGMan job event log to verify job removal
    TIMEOUT = 90
    exit_event = _await_exit_event(user_log, cluster_id, TIMEOUT)
    if exit_event is None:
        error(f"Failed to read remove event from DAGMan job log within {TIMEOUT} seconds")
    if exit_event.type == htcondor.JobEventType.JOB_TERMINATED:
        if exit_event["TerminatedNormally"]:
            print(f"DAG ({cluster_id}) exited normally on its own accord. Not restarting")
            sys.exit(0)
        print(f"DAG ({cluster_id}) terminated unsuccessfully. Restarting")

    # Resubmit from the log's directory; the DAG is named by its bare file name.
    # An empty dir_path means the log path had no directory component, so the
    # current directory is kept.
    if dir_path:
        try:
            os.chdir(dir_path)
        except OSError as e:
            error(f"Failed to change directory to {dir_path} | {e}")

    # Resubmit the DAG job
    print(f"Resubmitting {dag_file}")
    try:
        dag_desc = htcondor.Submit().from_dag(dag_file)
        info = schedd.submit(dag_desc)
    except Exception as e:
        # Broad on purpose: this is the script's top-level boundary, and any
        # failure while building or submitting the DAG job is reported the same way.
        error(f"Failed to resubmit {dag_file} | {e}")
    print(f"Successfully resubmitted {dag_file} with Cluster ID {info.cluster()}")

# Run the restart only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()