X-Git-Url: http://git.annexia.org/?p=whenjobs.git;a=blobdiff_plain;f=daemon%2Fdaemon.ml;h=ed5e067f802558f4ba3b55bbe677b23119156d6d;hp=fba3ae5114a76856dac3043ac06e302aa246604f;hb=57d21a4a716b513d93790262db4a663c169814bd;hpb=61cad7bbaf63389b520b695eefdd735bc11a8aa6 diff --git a/daemon/daemon.ml b/daemon/daemon.ml index fba3ae5..ed5e067 100644 --- a/daemon/daemon.ml +++ b/daemon/daemon.ml @@ -17,33 +17,46 @@ *) open Whenutils +open Whenexpr +open Big_int open Unix open Printf -(* All jobs that are loaded. Maps name -> [job] structure. *) -let jobs = ref StringMap.empty +(* See [exit.c]. *) +external _exit : int -> 'a = "whenjobs__exit" -(* Map variable names to jobs which depend on that variable. This - * gives us a quick way to tell which jobs might need to be reevaluated - * when a variable is set. - *) -let dependencies = ref StringMap.empty +(* $HOME/.whenjobs *) +let jobsdir = ref "" + +(* The state. *) +let state = ref Whenstate.empty -(* Current values of variables. Using the referentially transparent - * type Map is very useful here because it lets us cheaply keep - * previous values of variables. +(* Jobs that are running: a map of PID -> (job, tmpdir, serial, start_time). + * Note that the job may no longer exist *OR* it may have been renamed, + * eg. if the jobs file was reloaded. *) -let variables : variables ref = ref StringMap.empty +let runningmap = ref IntMap.empty -(* $HOME/.whenjobs *) -let jobsdir = ref "" +(* Serial numbers of running jobs. Map of serial -> PID (in runningmap). *) +let serialmap = ref BigIntMap.empty (* Was debugging requested on the command line? *) let debug = ref false +(* The server. *) +let server = ref None + let esys = Unixqueue.standard_event_system () +(* The timer. It's convenient to have this as a global variable + * because (a) there should only be one timer (which fires when the + * soonest every-job becomes ready), and (b) it's complicated to track + * that timer and avoid it getting double-scheduled (eg. when we + * reload the jobs file) without having a global variable. + *) +let timer_group = ref None + let rec init j d = jobsdir := j; debug := d; @@ -54,17 +67,29 @@ let rec init j d = let addr = sprintf "%s/socket" !jobsdir in (try unlink addr with Unix_error _ -> ()); - ignore ( + (* Create the Unix domain socket server. *) + server := Some ( Whenproto_srv.When.V1.create_server ~proc_reload_file ~proc_set_variable ~proc_get_variable ~proc_get_variable_names + ~proc_exit_daemon + ~proc_get_jobs + ~proc_cancel_job + ~proc_start_job + ~proc_get_job (Rpc_server.Unix addr) Rpc.Tcp (* not TCP, this is the same as SOCK_STREAM *) Rpc.Socket esys - ) + ); + + (* Handle SIGCHLD to clean up jobs. *) + Sys.set_signal Sys.sigchld (Sys.Signal_handle handle_sigchld); + + (* Initialize the variables. *) + state := Whenstate.set_variable !state "JOBSERIAL" (T_int zero_big_int) and proc_reload_file () = if !debug then Syslog.notice "remote call: reload_file"; @@ -75,43 +100,103 @@ and proc_reload_file () = and proc_set_variable (name, value) = if !debug then Syslog.notice "remote call: set_variable %s" name; - let value = variable_of_rpc value in - variables := StringMap.add name value !variables; + try + check_valid_variable_name name; - (* Which jobs need to be re-evaluated? *) - let jobnames = try StringMap.find name !dependencies with Not_found -> [] in - let jobs = reevaluate_jobs jobnames in - run_jobs jobs + let value = variable_of_rpc value in + state := Whenstate.set_variable !state name value; + + (* Which jobs need to be re-evaluated? *) + let jobs = Whenstate.get_dependencies !state [name] in + reevaluate_whenjobs jobs; + + `ok + with + Failure msg -> `error msg and proc_get_variable name = if !debug then Syslog.notice "remote call: get_variable %s" name; - try rpc_of_variable (StringMap.find name !variables) - with (* all non-existent variables are empty strings *) - Not_found -> `string_t "" + rpc_of_variable (Whenstate.get_variable !state name) and proc_get_variable_names () = if !debug then Syslog.notice "remote call: get_variable_names"; - (* Only return variables that are non-empty. *) - let vars = StringMap.fold ( - fun name value xs -> if value <> T_string "" then name :: xs else xs - ) !variables [] in + let vars = Whenstate.get_variable_names !state in + + (* Return variable names as a sorted array. *) let vars = Array.of_list vars in Array.sort compare vars; vars +and proc_exit_daemon () = + if !debug then Syslog.notice "remote call: exit_daemon"; + + match !server with + | None -> + `error "exit_daemon: no server handle" + | Some s -> + Rpc_server.stop_server ~graceful:true s; + server := None; + `ok + +and proc_get_jobs () = + let running = Array.of_list (IntMap.values !runningmap) in + Array.map ( + fun (job, dir, serial, start_time) -> + { Whenproto_aux.job_name = job.job_name; + job_serial = string_of_big_int serial; + job_tmpdir = dir; job_start_time = Int64.of_float start_time } + ) running + +and proc_cancel_job serial = + try + let serial = big_int_of_string serial in + let pid = BigIntMap.find serial !serialmap in + kill pid 15; + `ok + with + | Not_found -> `error "job not found" + | exn -> `error (Printexc.to_string exn) + +and proc_start_job jobname = + try + let job = Whenstate.get_job !state jobname in + run_job job; + `ok + with + | Not_found -> `error "job not found" + | exn -> `error (Printexc.to_string exn) + +and proc_get_job serial = + try + let serial = big_int_of_string serial in + let pid = BigIntMap.find serial !serialmap in + let job, dir, serial, start_time = IntMap.find pid !runningmap in + { Whenproto_aux.job_name = job.job_name; + job_serial = string_of_big_int serial; + job_tmpdir = dir; job_start_time = Int64.of_float start_time } + with + | Not_found -> failwith "job not found" + | exn -> failwith (Printexc.to_string exn) + (* Reload the jobs file. *) and reload_file () = let file = sprintf "%s/jobs.cmo" !jobsdir in - Whenfile.init (); - let js = + (* As we are reloading the file, we want to create a new state + * that has no jobs, but has all the variables from the previous + * state. + *) + let s = Whenstate.copy_variables !state Whenstate.empty in + Whenfile.init s; + + let s = try Dynlink.loadfile file; - let jobs = Whenfile.get_jobs () in - Syslog.notice "loaded %d job(s) from %s" (List.length jobs) file; - jobs + let s = Whenfile.get_state () in + Syslog.notice "loaded %d job(s) from %s" (Whenstate.nr_jobs s) file; + s with | Dynlink.Error err -> let err = Dynlink.error_message err in @@ -120,69 +205,294 @@ and reload_file () = | exn -> failwith (Printexc.to_string exn) in - (* Set 'jobs' and related global variables. *) - let () = - let map = List.fold_left ( - fun map j -> - let name = j.job_name in - StringMap.add name j map - ) StringMap.empty js in - jobs := map in - - let () = - let map = List.fold_left ( - fun map j -> - let deps = dependencies_of_job j in - let name = j.job_name in - List.fold_left ( - fun map d -> - let names = try StringMap.find d map with Not_found -> [] in - StringMap.add d (name :: names) map - ) map deps - ) StringMap.empty js in - dependencies := map in - - (* Re-evaluate all jobs. *) - let jobs = reevaluate_jobs (StringMap.keys !jobs) in - run_jobs jobs - -(* Re-evaluate each named job, in a loop until we reach a fixpoint. - * Return the names of all the jobs that need to be run. + let s = Whenstate.copy_prev_state !state s in + state := s; + + (* Re-evaluate all when jobs. *) + reevaluate_whenjobs ~onload:true (Whenstate.get_whenjobs !state); + + (* Schedule the next every job to run. *) + schedule_next_everyjob () + +(* Re-evaluate each when-statement job, in a loop until we reach + * a fixpoint. Run those that need to be run. *) -and reevaluate_jobs jobnames = - let rec loop set jobnames = +and reevaluate_whenjobs ?onload jobs = + let rec loop set jobs = let set' = List.fold_left ( - fun set jobname -> - let job = - try StringMap.find jobname !jobs - with Not_found -> assert false in - assert (jobname = job.job_name); + fun set job -> + let r, state' = + try Whenstate.evaluate_whenjob ?onload !state job + with Invalid_argument err | Failure err -> + Syslog.error "error evaluating job %s (at %s): %s" + job.job_name (Camlp4.PreCast.Ast.Loc.to_string job.job_loc) err; + false, !state in - let r, job' = job_evaluate job !variables in - jobs := StringMap.add jobname job' !jobs; + state := state'; if !debug then - Syslog.notice "evaluate %s -> %b\n" jobname r; + Syslog.notice "evaluate %s -> %b\n" job.job_name r; - if r then StringSet.add jobname set else set - ) set jobnames in + if r then StringSet.add job.job_name set else set + ) set jobs in if StringSet.compare set set' <> 0 then - loop set' jobnames + loop set' jobs else set' in - let set = loop StringSet.empty jobnames in - StringSet.elements set + let set = loop StringSet.empty jobs in + let jobnames = StringSet.elements set in + + (* Ensure the jobs always run in predictable (name) order. *) + let jobnames = List.sort compare_jobnames jobnames in -and run_jobs jobnames = - let run_job job = - Syslog.notice "running %s" job.job_name; - () (* XXX *) + (* Run the jobs. *) + List.iter run_job (List.map (Whenstate.get_job !state) jobnames) + +(* Schedule the next every-statement job to run, if there is one. We + * look at the every jobs, work out the time that each must run at, + * pick the job(s) which must run soonest, and schedule a timer to run + * them. When the timer fires, it runs those jobs, then calls this + * function again. + *) +and schedule_next_everyjob () = + let t = time () in + + (* Get only everyjobs. *) + let jobs = Whenstate.get_everyjobs !state in + let jobs = List.map ( + function + | { job_cond = Every_job period } as job -> (job, period) + | { job_cond = When_job _ } -> assert false + ) jobs in + + (* Map everyjob to next time it must run. *) + let jobs = List.map ( + fun (job, period) -> + let t' = next_periodexpr t period in + assert (t' > t); (* serious bug in next_periodexpr if false *) + job, t' + ) jobs in + + (* Sort, soonest first. *) + let jobs = List.sort (fun (_,a) (_,b) -> compare a b) jobs in + + if !debug then ( + List.iter ( + fun (job, t) -> + Syslog.notice "%s: next scheduled run at %s" + job.job_name (string_of_time_t t) + ) jobs + ); + + (* Pick the job(s) which run soonest. *) + let rec pick = function + | [] -> 0., [] + | [j, t] -> t, [j] + | (j1, t) :: (j2, t') :: _ when t < t' -> t, [j1] + | (j1, t) :: (((j2, t') :: _) as rest) -> t, (j1 :: snd (pick rest)) in + let t, jobs = pick jobs in + + if t > 0. then ( + if jobs <> [] then ( + (* Ensure the jobs always run in predictable (name) order. *) + let jobs = + List.sort (fun {job_name = a} {job_name = b} -> compare_jobnames a b) + jobs in + + if !debug then + Syslog.notice "scheduling job(s) %s to run at %s" + (String.concat ", " (List.map (fun { job_name = name } -> name) jobs)) + (string_of_time_t t); + + (* Schedule them to run at time t. *) + let g = new_timer_group () in + let t_diff = t -. Unix.time () in + let t_diff = if t_diff < 0. then 0. else t_diff in + let run_jobs () = + delete_timer_group (); (* Delete the timer. *) + List.iter run_job jobs; + schedule_next_everyjob () + in + Unixqueue.weak_once esys g t_diff run_jobs; + ) + ) - List.iter run_job - (List.map (fun jobname -> StringMap.find jobname !jobs) jobnames) +and new_timer_group () = + delete_timer_group (); + let g = Unixqueue.new_group esys in + timer_group := Some g; + g + +and delete_timer_group () = + match !timer_group with + | None -> () + | Some g -> + Unixqueue.clear esys g; + timer_group := None + +and run_job job = + (* Increment JOBSERIAL. *) + let serial = + match Whenstate.get_variable !state "JOBSERIAL" with + | T_int serial -> + let serial = succ_big_int serial in + state := Whenstate.set_variable !state "JOBSERIAL" (T_int serial); + serial + | _ -> assert false in + + (* Call the pre-condition script. Note this may decide not to run + * the job by returning false. + *) + let pre_condition () = + match job.job_pre with + | None -> true + | Some pre -> + let rs = ref [] in + IntMap.iter ( + fun pid (job, _, serial, start_time) -> + let r = { pirun_job_name = job.job_name; + pirun_serial = serial; + pirun_start_time = start_time; + pirun_pid = pid } in + rs := r :: !rs + ) !runningmap; + let preinfo = { + pi_job_name = job.job_name; + pi_serial = serial; + pi_variables = Whenstate.get_variables !state; + pi_running = !rs; + } in + pre preinfo + in + if pre_condition () then ( + Syslog.notice "running %s (JOBSERIAL=%s)" + job.job_name (string_of_big_int serial); + + (* Create a temporary directory. The current directory of the job + * will be in this directory. The directory is removed when the + * child process exits. + *) + let dir = tmpdir () in + + let pid = fork () in + if pid = 0 then ( (* child process running the job *) + chdir dir; + + (* Set environment variables corresponding to each variable. *) + List.iter + (fun (name, value) -> putenv name (string_of_variable value)) + (Whenstate.get_variables !state); + + (* Set the $JOBNAME environment variable. *) + putenv "JOBNAME" job.job_name; + + (* Create a temporary file containing the shell script fragment. *) + let script = dir // "script.sh" in + let chan = open_out script in + fprintf chan "set -e\n"; (* So that jobs exit on error. *) + output_string chan job.job_script.sh_script; + close_out chan; + chmod script 0o700; + + let shell = try getenv "SHELL" with Not_found -> "/bin/sh" in + + (* Set output to file. *) + let output = dir // "output.txt" in + let fd = openfile output [O_WRONLY; O_CREAT; O_TRUNC; O_NOCTTY] 0o600 in + dup2 fd stdout; + dup2 fd stderr; + close fd; + + (* Execute the shell script. *) + (try execvp shell [| shell; "-c"; script |]; + with Unix_error (err, fn, _) -> + Syslog.error "%s failed: %s: %s" fn script (error_message err) + ); + _exit 1 + ); + + (* Remember this PID, the job and the temporary directory, so we + * can clean up when the child exits. + *) + runningmap := IntMap.add pid (job, dir, serial, time ()) !runningmap; + serialmap := BigIntMap.add serial pid !serialmap + ) + else ( + Syslog.notice "not running %s (JOBSERIAL=%s) because pre() condition returned false" + job.job_name (string_of_big_int serial); + ) + +and tmpdir () = + let chan = open_in "/dev/urandom" in + let data = String.create 16 in + really_input chan data 0 (String.length data); + close_in chan; + let data = Digest.to_hex (Digest.string data) in + let dir = Filename.temp_dir_name // sprintf "whenjobs%s" data in + mkdir dir 0o700; + dir + +(* This is called when a job (child process) exits. *) +and handle_sigchld _ = + try + let pid, status = waitpid [WNOHANG] 0 in + if pid > 0 then ( + (* Look up the PID in the running jobs map. *) + let job, dir, serial, time = IntMap.find pid !runningmap in + runningmap := IntMap.remove pid !runningmap; + serialmap := BigIntMap.remove serial !serialmap; + post_job job dir serial time status + ) + with Unix_error _ | Not_found -> () + +and post_job job dir serial time status = + (* If there is a post function, run it. *) + (match job.job_post with + | None -> () + | Some post -> + let code = + match status with + | WEXITED c -> c + | WSIGNALED s | WSTOPPED s -> 1 in + let result = { + res_job_name = job.job_name; + res_serial = serial; + res_code = code; + res_tmpdir = dir; + res_output = dir // "output.txt"; + res_start_time = time + } in + try post result + with + | Failure msg -> + Syslog.error "job %s post function failed: %s" job.job_name msg + | exn -> + Syslog.error "job %s post function exception: %s" + job.job_name (Printexc.to_string exn) + ); + + (* This should be safe because the path cannot contain shell metachars. *) + let cmd = sprintf "rm -rf '%s'" dir in + ignore (Sys.command cmd) + +(* Intelligent comparison of job names. *) +and compare_jobnames name1 name2 = + try + let len1 = String.length name1 + and len2 = String.length name2 in + if len1 > 4 && len2 > 4 && + String.sub name1 0 4 = "job$" && String.sub name2 0 4 = "job$" + then ( + let i1 = int_of_string (String.sub name1 4 (len1-4)) in + let i2 = int_of_string (String.sub name2 4 (len2-4)) in + compare i1 i2 + ) + else raise Not_found + with _ -> + compare name1 name2 let main_loop () = Unixqueue.run esys