(* whenjobs
 * Copyright (C) 2012 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *)

open Whenutils
open Whenexpr

open Big_int
open Unix
open Printf

(* See [exit.c]. *)
external _exit : int -> 'a = "whenjobs__exit"

(* $HOME/.whenjobs *)
let jobsdir = ref ""

(* The state.
 *
 * Note that whenever this is updated, you need to consider if you
 * should call 'save_variables ()' (which persists the variables to a
 * file).  XXX We should replace this ref with an accessor
 * function.
 *)
let state = ref Whenstate.empty

(* Format used to save variables.  Note we can't allow any internal
 * types to escape into this definition, else the file format will
 * change when parts of the program change.
 *
 * Values of this type are written with output_value, so the order and
 * shape of the constructors below is part of the on-disk format and
 * must not be changed.
 *)
type variables_file_v1 = (string * variable_v1) list
and variable_v1 =
  | Vv1_unit
  | Vv1_bool of bool
  | Vv1_string of string
  | Vv1_int of big_int
  | Vv1_float of float

(* Convert a value read from a version 1 variables file into the
 * in-memory variable representation.
 *)
let variable_of_variable_v1 saved =
  match saved with
  | Vv1_unit -> T_unit
  | Vv1_bool flag -> T_bool flag
  | Vv1_string str -> T_string str
  | Vv1_int num -> T_int num
  | Vv1_float flt -> T_float flt

(* Convert an in-memory variable into the version 1 file
 * representation (the inverse of variable_of_variable_v1).
 *)
let variable_v1_of_variable var =
  match var with
  | T_unit -> Vv1_unit
  | T_bool flag -> Vv1_bool flag
  | T_string str -> Vv1_string str
  | T_int num -> Vv1_int num
  | T_float flt -> Vv1_float flt

(* Jobs that are running: a map of PID -> (job, tmpdir, serial, start_time).
* Note that the job may no longer exist *OR* it may have been renamed,
 * eg. if the jobs file was reloaded.
 *)
let runningmap = ref IntMap.empty

(* Serial numbers of running jobs.  Map of serial -> PID (in runningmap). *)
let serialmap = ref BigIntMap.empty

(* Was debugging requested on the command line? *)
let debug = ref false

(* The server. *)
let server = ref None

let esys = Unixqueue.standard_event_system ()

(* The timer.  It's convenient to have this as a global variable
 * because (a) there should only be one timer (which fires when the
 * soonest every-job becomes ready), and (b) it's complicated to track
 * that timer and avoid it getting double-scheduled (eg. when we
 * reload the jobs file) without having a global variable.
 *)
let timer_group = ref None

(* [init j d] initializes the daemon: [j] is stored as the jobs
 * directory and [d] as the debug flag.  It then takes the lock,
 * creates the RPC server on the socket in the jobs directory,
 * installs the SIGCHLD handler, and loads the saved variables (or
 * starts with JOBSERIAL = 0 if they cannot be opened).
 *)
let rec init j d =
  jobsdir := j;
  debug := d;

  Whenlock.create_lock !jobsdir;

  (* Remove old socket if it exists. *)
  let addr = sprintf "%s/socket" !jobsdir in
  (try unlink addr with Unix_error _ -> ());

  (* Create the Unix domain socket server. *)
  server := Some (
    Whenproto_srv.When.V1.create_server
      ~proc_reload_file
      ~proc_set_variable
      ~proc_get_variable
      ~proc_get_variable_names
      ~proc_exit_daemon
      ~proc_get_jobs
      ~proc_cancel_job
      ~proc_start_job
      ~proc_get_job
      ~proc_set_variables
      ~proc_get_job_names
      ~proc_test_variables
      ~proc_ping_daemon
      ~proc_whisper_variables
      (Rpc_server.Unix addr)
      Rpc.Tcp (* not TCP, this is the same as SOCK_STREAM *)
      Rpc.Socket
      esys
  );

  (* Handle SIGCHLD to clean up jobs. *)
  Sys.set_signal Sys.sigchld (Sys.Signal_handle handle_sigchld);

  (* Load or initialize the variables.  A Sys_error (eg. the file does
   * not exist) means we start from scratch; any other exception from
   * load_variables propagates to the caller.
   *)
  let variables_file = sprintf "%s/variables" !jobsdir in
  state :=
    try
      let chan = open_in variables_file in
      let r =
        (* Don't leak the channel if loading fails: close it, then
         * re-raise so the exception is handled exactly as before.
         *)
        try load_variables !state chan
        with exn -> (close_in_noerr chan; raise exn) in
      close_in chan;
      r
    with
    | Sys_error _ ->
      Whenstate.set_variable !state "JOBSERIAL" (T_int zero_big_int)

(* Try to load the variables from the file.  If the file exists and
 * cannot be read, raise an exception.
*)
and load_variables state chan =
  (* The first line of the file is a signature naming the format
   * version; the marshalled variable list follows it.
   *)
  let signature = input_line chan in
  if signature = "WHENJOBS VARIABLES VERSION 1" then (
    let variables : variables_file_v1 = input_value chan in
    List.fold_left (
      fun state (n, v) ->
        Whenstate.set_variable state n (variable_of_variable_v1 v)
    ) state variables
  ) else (* in future, other signatures, but for now ... *)
    failwith (sprintf "cannot read variables file: invalid signature: %s"
                signature)

(* Persist the variables of the current global state to
 * <jobsdir>/variables.  The data is written to a .new file, synced,
 * and then renamed over the real file.
 *)
and save_variables () =
  let variables_file = sprintf "%s/variables" !jobsdir in
  let new_file = variables_file ^ ".new" in
  let chan = open_out new_file in
  fprintf chan "WHENJOBS VARIABLES VERSION 1\n";
  let variables = Whenstate.get_variables !state in
  let variables =
    List.map (fun (n, v) -> n, variable_v1_of_variable v) variables in
  output_value chan variables;

  (* Try to arrange that the new file is updated atomically. *)
  flush chan;
  Netsys_posix.fsync (descr_of_out_channel chan);
  close_out chan;
  rename new_file variables_file

(* Remote call: reload the jobs file(s).  A Failure raised by
 * reload_files is returned to the client as an error.
 *)
and proc_reload_file () =
  if !debug then Syslog.notice "remote call: reload_file";

  try reload_files (); `ok
  with Failure err -> `error err

(* Remote call: set a single variable, then re-evaluate and run the
 * when-jobs that depend on it and save the variables.
 *)
and proc_set_variable (name, value) =
  if !debug then Syslog.notice "remote call: set_variable %s" name;

  try
    check_valid_variable_name name;

    let value = variable_of_rpc value in
    state := Whenstate.set_variable !state name value;

    (* Which jobs need to be re-evaluated? *)
    let jobs = Whenstate.get_dependencies !state [name] in
    let jobnames, state' = reevaluate_whenjobs !state jobs in
    let state' = run_whenjobs state' jobnames in
    state := state';
    save_variables ();

    `ok
  with
    Failure msg -> `error msg

(* Remote call: return the value of one variable. *)
and proc_get_variable name =
  if !debug then Syslog.notice "remote call: get_variable %s" name;

  rpc_of_variable (Whenstate.get_variable !state name)

(* Remote call: return the names of all variables. *)
and proc_get_variable_names () =
  if !debug then Syslog.notice "remote call: get_variable_names";

  let vars = Whenstate.get_variable_names !state in

  (* Return variable names as a sorted array. *)
  let vars = Array.of_list vars in
  Array.sort compare vars;
  vars

(* Remote call: stop the RPC server gracefully and drop the handle. *)
and proc_exit_daemon () =
  if !debug then Syslog.notice "remote call: exit_daemon";

  match !server with
  | None ->
    `error "exit_daemon: no server handle"
  | Some s ->
    Rpc_server.stop_server ~graceful:true s;
    server := None;
    Gc.compact (); (* force the server handle to get cleaned up now *)
    `ok

(* Remote call: return one record per entry in runningmap. *)
and proc_get_jobs () =
  let running = Array.of_list (IntMap.values !runningmap) in
  Array.map (
    fun (job, dir, serial, start_time) ->
      { Whenproto_aux.job_name = job.job_name;
        job_serial = string_of_big_int serial;
        job_tmpdir = dir; job_start_time = Int64.of_float start_time }
  ) running

(* Remote call: send signal 15 to the process running the job with
 * the given serial number (passed as a decimal string).
 *)
and proc_cancel_job serial =
  try
    let serial = big_int_of_string serial in
    let pid = BigIntMap.find serial !serialmap in
    kill pid 15;
    `ok
  with
  | Not_found -> `error "job not found"
  | exn -> `error (Printexc.to_string exn)

(* Remote call: run the named job now, then save the variables. *)
and proc_start_job jobname =
  try
    let job = Whenstate.get_job !state jobname in
    let state' = run_job !state job in
    state := state';
    save_variables ();
    `ok
  with
  | Not_found -> `error "job not found"
  | exn -> `error (Printexc.to_string exn)

(* Remote call: return the record of the running job with the given
 * serial number.  Unlike the calls above, errors are raised as
 * Failure rather than returned.
 *)
and proc_get_job serial =
  try
    let serial = big_int_of_string serial in
    let pid = BigIntMap.find serial !serialmap in
    let job, dir, serial, start_time = IntMap.find pid !runningmap in
    { Whenproto_aux.job_name = job.job_name;
      job_serial = string_of_big_int serial;
      job_tmpdir = dir; job_start_time = Int64.of_float start_time }
  with
  | Not_found -> failwith "job not found"
  | exn -> failwith (Printexc.to_string exn)

(* Remote call: set several variables in one step, then re-evaluate
 * and run the when-jobs that depend on any of them and save the
 * variables.
 *)
and proc_set_variables vars =
  try
    let vars = Array.map (
      fun { Whenproto_aux.sv_name = name; sv_value = value } ->
        name, variable_of_rpc value
    ) vars in
    let vars = Array.to_list vars in

    if !debug then
      Syslog.notice "remote call: set_variables (%s)"
        (String.concat " "
           (List.map (
             fun (name, value) ->
               sprintf "%s=%s" name (string_of_variable value)
            ) vars));

    List.iter (fun (name, _) -> check_valid_variable_name name) vars;

    (* Update all the variables atomically. *)
    let s = List.fold_left (
      fun s (name, value) -> Whenstate.set_variable s name value
    ) !state vars in
    state := s;

    (* Which jobs need to be re-evaluated? *)
    let jobs = Whenstate.get_dependencies !state (List.map fst vars) in
    let jobnames, state' = reevaluate_whenjobs !state jobs in
    let state' = run_whenjobs state' jobnames in
    state := state';
    save_variables ();

    `ok
  with
    Failure msg -> `error msg

(* Remote call: return the names of all jobs in the current state. *)
and proc_get_job_names () =
  Array.of_list (Whenstate.get_job_names !state)

and proc_test_variables vars =
  (* This is the same as proc_set_variables, except that it doesn't
   * update the state, it just returns the jobs that *would* run if
   * these variables were set to these values.
   *)
  let vars = Array.map (
    fun { Whenproto_aux.sv_name = name; sv_value = value } ->
      name, variable_of_rpc value
  ) vars in
  let vars = Array.to_list vars in

  if !debug then
    Syslog.notice "remote call: test_variables (%s)"
      (String.concat " "
         (List.map (
           fun (name, value) ->
             sprintf "%s=%s" name (string_of_variable value)
          ) vars));

  List.iter (fun (name, _) -> check_valid_variable_name name) vars;

  (* Update all the variables atomically.  Note this [state] is a
   * local value which shadows the global ref; the global state is
   * not modified.
   *)
  let state = List.fold_left (
    fun s (name, value) -> Whenstate.set_variable s name value
  ) !state vars in

  (* Which jobs WOULD be re-evaluated? *)
  let jobs = Whenstate.get_dependencies state (List.map fst vars) in
  let jobnames, _ = reevaluate_whenjobs state jobs in

  (* Return the names. *)
  Array.of_list jobnames

(* Remote call: liveness check, always succeeds. *)
and proc_ping_daemon () = `ok

(* Remote call: set several variables and save them, without
 * re-evaluating or running any jobs.
 *)
and proc_whisper_variables vars =
  try
    let vars = Array.map (
      fun { Whenproto_aux.sv_name = name; sv_value = value } ->
        name, variable_of_rpc value
    ) vars in
    let vars = Array.to_list vars in

    if !debug then
      Syslog.notice "remote call: whisper_variables (%s)"
        (String.concat " "
           (List.map (
             fun (name, value) ->
               sprintf "%s=%s" name (string_of_variable value)
            ) vars));

    List.iter (fun (name, _) -> check_valid_variable_name name) vars;

    (* Update all the variables atomically. *)
    let s = List.fold_left (
      fun s (name, value) -> Whenstate.set_variable s name value
    ) !state vars in
    state := s;
    save_variables ();

    (* .. but don't reevaluate or run jobs. *)

    `ok
  with
    Failure msg -> `error msg

(* Reload the jobs file(s). *)
and reload_files () =
  (* Get the highest numbered dir/jobs__*.cmo (bytecode) or
   * dir/jobs__*.cmxs (native code) file and load it.  Delete
   * lower-numbered (== older) files.
   *)
  let filename =
    let suffix, slen =
      if not Dynlink.is_native then ".cmo", 4 else ".cmxs", 5 in
    let dir = !jobsdir in
    let files = Array.to_list (Sys.readdir dir) in
    let times = filter_map (
      fun file ->
        if not (string_startswith file "jobs__") ||
           not (string_endswith file suffix) then
          None
        else (
          let len = String.length file in
          let t = String.sub file 6 (len-slen-6) in
          (* Use int64 because t won't necessarily fit into 31 bit int.
           * A file whose middle part is not a number is not one of
           * ours, so skip it instead of aborting the whole reload.
           *)
          try Some (Int64.of_string t)
          with Failure _ -> None
        )
    ) files in
    let times = List.rev (List.sort compare times) in
    match times with
    | [] -> None
    | x::xs ->
      (* Unlink the older files. *)
      List.iter (
        fun t ->
          try unlink (dir // sprintf "jobs__%Ld%s" t suffix)
          with Unix_error _ -> ()
      ) xs;
      (* Return the newest (highest numbered) file. *)
      Some (dir // sprintf "jobs__%Ld%s" x suffix) in

  (* As we are reloading the file, we want to create a new state
   * that has no jobs, but has all the variables from the previous
   * state.
   *)
  let s = Whenstate.copy_variables !state Whenstate.empty in
  Whenfile.init s;

  let s =
    match filename with
    | None ->
      (* no jobs file, return the same state *)
      Syslog.notice "no jobs file found";
      s
    | Some filename ->
      try
        Dynlink.loadfile filename;
        let s = Whenfile.get_state () in
        Syslog.notice "loaded %d job(s)" (Whenstate.nr_jobs s);
        s
      with
      | Dynlink.Error err ->
        let err = Dynlink.error_message err in
        Syslog.error "error loading jobs: %s" err;
        failwith err
      | exn ->
        failwith (Printexc.to_string exn) in

  let s = Whenstate.copy_prev_state !state s in
  state := s;

  (* Re-evaluate all when jobs. *)
  let jobs = Whenstate.get_whenjobs !state in
  let jobnames, state' = reevaluate_whenjobs ~onload:true !state jobs in
  let state' = run_whenjobs state' jobnames in
  state := state';
  save_variables ();

  (* Schedule the next every job to run. *)
  schedule_next_everyjob ()

(* Re-evaluate each when-statement job, in a loop until we reach
 * a fixpoint.  Return the list of job names that should run and
 * the updated state.
 *)
and reevaluate_whenjobs ?onload state jobs =
  let rec loop (set, state) jobs =
    let set', state' =
      List.fold_left (
        fun (set, state) job ->
          (* An evaluation error is logged and treated as
           * [false] with the state left unchanged.
           *)
          let r, state' =
            try Whenstate.evaluate_whenjob ?onload state job
            with Invalid_argument err | Failure err ->
              Syslog.error "error evaluating job %s (at %s): %s"
                job.job_name (Camlp4.PreCast.Ast.Loc.to_string job.job_loc) err;
              false, state in

          if !debug then
            Syslog.notice "evaluate %s -> %b\n" job.job_name r;

          (if r then StringSet.add job.job_name set else set), state'
      ) (set, state) jobs in
    (* reached a fixpoint? *)
    if StringSet.compare set set' <> 0 then
      loop (set', state') jobs
    else
      (set', state')
  in
  let set, state = loop (StringSet.empty, state) jobs in
  let jobnames = StringSet.elements set in

  (* Ensure the jobs always run in predictable (name) order. *)
  let jobnames = List.sort compare_jobnames jobnames in

  jobnames, state

(* Run the named jobs in the order given, threading the state through
 * run_job, and return the final state.
 *)
and run_whenjobs state jobnames =
  (* Run the jobs. *)
  let jobs = List.map (Whenstate.get_job state) jobnames in
  List.fold_left run_job state jobs

(* Schedule the next every-statement job to run, if there is one.  We
 * look at the every jobs, work out the time that each must run at,
 * pick the job(s) which must run soonest, and schedule a timer to run
 * them.  When the timer fires, it runs those jobs, then calls this
 * function again.
 *)
and schedule_next_everyjob () =
  let t = time () in

  (* Get only everyjobs. *)
  let jobs = Whenstate.get_everyjobs !state in
  let jobs = List.map (
    function
    | { job_cond = Every_job period } as job -> (job, period)
    | { job_cond = When_job _ } -> assert false
  ) jobs in

  (* Map everyjob to next time it must run. *)
  let jobs = List.map (
    fun (job, period) ->
      let t' = next_periodexpr t period in
      assert (t' > t); (* serious bug in next_periodexpr if false *)
      job, t'
  ) jobs in

  (* Sort, soonest first. *)
  let jobs = List.sort (fun (_,a) (_,b) -> compare a b) jobs in

  if !debug then (
    List.iter (
      fun (job, t) ->
        Syslog.notice "%s: next scheduled run at %s"
          job.job_name (string_of_time_t t)
    ) jobs
  );

  (* Pick the job(s) which run soonest.  The list is sorted, so this
   * takes the leading run of jobs that share the earliest time.  An
   * empty list yields time 0., which the test below treats as
   * nothing to schedule.
   *)
  let rec pick = function
    | [] -> 0., []
    | [j, t] -> t, [j]
    | (j1, t) :: (j2, t') :: _ when t < t' -> t, [j1]
    | (j1, t) :: (((j2, t') :: _) as rest) -> t, (j1 :: snd (pick rest))
  in
  let t, jobs = pick jobs in

  if t > 0. then (
    if jobs <> [] then (
      (* Ensure the jobs always run in predictable (name) order. *)
      let jobs =
        List.sort (fun {job_name = a} {job_name = b} -> compare_jobnames a b)
          jobs in

      if !debug then
        Syslog.notice "scheduling job(s) %s to run at %s"
          (String.concat ", " (List.map (fun { job_name = name } -> name) jobs))
          (string_of_time_t t);

      (* Schedule them to run at time t. *)
      let g = new_timer_group () in
      let t_diff = t -. Unix.time () in
      let t_diff = if t_diff < 0. then 0. else t_diff in
      let run_jobs () =
        delete_timer_group ();          (* Delete the timer. *)
        let state' = List.fold_left run_job !state jobs in
        state := state';
        save_variables ();
        schedule_next_everyjob ()
      in
      Unixqueue.weak_once esys g t_diff run_jobs;
    )
  )

(* Replace the global timer group with a fresh one (clearing any
 * existing one first) and return it.
 *)
and new_timer_group () =
  delete_timer_group ();
  let g = Unixqueue.new_group esys in
  timer_group := Some g;
  g

(* Clear the global timer group, if there is one. *)
and delete_timer_group () =
  match !timer_group with
  | None -> ()
  | Some g ->
    Unixqueue.clear esys g;
    timer_group := None

(* Run one job: bump JOBSERIAL, consult the job's pre function (if
 * any), and if it agrees fork a child which runs the job script.
 * Returns the state with the incremented JOBSERIAL in either case.
 *)
and run_job state job =
  (* Increment JOBSERIAL. *)
  let serial, state =
    match Whenstate.get_variable state "JOBSERIAL" with
    | T_int serial ->
      let serial = succ_big_int serial in
      let state' = Whenstate.set_variable state "JOBSERIAL" (T_int serial) in
      serial, state'
    | _ -> assert false in

  (* Call the pre-condition script.  Note this may decide not to run
   * the job by returning false.
   *)
  let pre_condition () =
    match job.job_pre with
    | None -> true
    | Some pre ->
      let rs = ref [] in
      IntMap.iter (
        fun pid (job, _, serial, start_time) ->
          let r = { pirun_job_name = job.job_name;
                    pirun_serial = serial;
                    pirun_start_time = start_time;
                    pirun_pid = pid } in
          rs := r :: !rs
      ) !runningmap;
      let preinfo = {
        pi_job_name = job.job_name;
        pi_serial = serial;
        pi_variables = Whenstate.get_variables state;
        pi_running = !rs;
      } in
      pre preinfo
  in
  if pre_condition () then (
    Syslog.notice "running %s (JOBSERIAL=%s)"
      job.job_name (string_of_big_int serial);

    (* Create a temporary directory.  The current directory of the job
     * will be in this directory.  The directory is removed when the
     * child process exits.
     *)
    let dir = tmpdir () in

    let pid = fork () in
    if pid = 0 then ( (* child process running the job *)
      (* Everything the child does is wrapped in a handler: if any
       * step before the exec raises, the exception must not unwind
       * into the daemon's own code, which would leave a second copy
       * of the daemon running in the child.  Whatever happens, the
       * child ends at the _exit below.
       *)
      (try
         chdir dir;

         (* Set environment variables corresponding to each variable. *)
         List.iter
           (fun (name, value) -> putenv name (string_of_variable value))
           (Whenstate.get_variables state);

         (* Set the $JOBNAME environment variable. *)
         putenv "JOBNAME" job.job_name;

         (* Create a temporary file containing the shell script fragment. *)
         let script = dir // "script.sh" in
         let chan = open_out script in
         fprintf chan "set -e\n"; (* So that jobs exit on error. *)
         output_string chan job.job_script.sh_script;
         close_out chan;
         chmod script 0o700;

         let shell = try getenv "SHELL" with Not_found -> "/bin/sh" in

         (* Set output to file. *)
         let output = dir // "output.txt" in
         let fd =
           openfile output [O_WRONLY; O_CREAT; O_TRUNC; O_NOCTTY] 0o600 in
         dup2 fd stdout;
         dup2 fd stderr;
         close fd;

         (* Execute the shell script. *)
         (try execvp shell [| shell; "-c"; script |];
          with Unix_error (err, fn, _) ->
            Syslog.error "%s failed: %s: %s" fn script (error_message err)
         )
       with exn ->
         (* Best effort only: we are about to exit anyway, so a
          * failure to log must not escape either.
          *)
         (try
            Syslog.error "job %s: failed to start in child: %s"
              job.job_name (Printexc.to_string exn)
          with _ -> ())
      );
      _exit 1
    );

    (* Remember this PID, the job and the temporary directory, so we
     * can clean up when the child exits.
     *)
    runningmap := IntMap.add pid (job, dir, serial, time ()) !runningmap;
    serialmap := BigIntMap.add serial pid !serialmap;

    state
  ) else (
    Syslog.notice "not running %s (JOBSERIAL=%s) because pre() condition returned false"
      job.job_name (string_of_big_int serial);

    state
  )

(* Create a new private directory (mode 0700) under the system
 * temporary directory, with a name derived from 16 bytes read from
 * /dev/urandom, and return its path.
 *)
and tmpdir () =
  let chan = open_in "/dev/urandom" in
  let data = String.create 16 in
  really_input chan data 0 (String.length data);
  close_in chan;
  let data = Digest.to_hex (Digest.string data) in
  let dir = Filename.temp_dir_name // sprintf "whenjobs%s" data in
  mkdir dir 0o700;
  dir

(* This is called when a job (child process) exits.
 *
 * Several children can exit before the handler gets to run, and the
 * handler may then be invoked only once for all of them.  So keep
 * calling waitpid until it reports that nothing more is ready (pid 0)
 * or raises (eg. no children at all); otherwise exited jobs would
 * stay as zombies and remain in runningmap / serialmap.
 *)
and handle_sigchld _ =
  let rec reap () =
    let pid, status = waitpid [WNOHANG] 0 in
    if pid > 0 then (
      (try
         (* Look up the PID in the running jobs map. *)
         let job, dir, serial, time = IntMap.find pid !runningmap in
         runningmap := IntMap.remove pid !runningmap;
         serialmap := BigIntMap.remove serial !serialmap;
         post_job job dir serial time status
       with Not_found -> () (* not one of our jobs; ignore it *)
      );
      reap ()
    )
  in
  try reap ()
  with Unix_error _ -> ()

(* Called after a job's process has been reaped: run the job's post
 * function (if any) and then remove the job's temporary directory.
 *)
and post_job job dir serial time status =
  (* If there is a post function, run it. *)
  (match job.job_post with
   | None -> ()
   | Some post ->
     (* A job that was signalled or stopped is reported with code 1. *)
     let code =
       match status with
       | WEXITED c -> c
       | WSIGNALED s | WSTOPPED s -> 1 in
     let result = {
       res_job_name = job.job_name;
       res_serial = serial;
       res_code = code;
       res_tmpdir = dir;
       res_output = dir // "output.txt";
       res_start_time = time
     } in
     (* Errors in the post function are logged, never propagated. *)
     try post result
     with
     | Failure msg ->
       Syslog.error "job %s post function failed: %s" job.job_name msg
     | exn ->
       Syslog.error "job %s post function exception: %s" job.job_name
         (Printexc.to_string exn)
  );

  (* This should be safe because the path cannot contain shell metachars. *)
  let cmd = sprintf "rm -rf '%s'" dir in
  ignore (Sys.command cmd)

(* Intelligent comparison of job names.
*)
and compare_jobnames name1 name2 =
  (* Two names of the form job$<integer> are ordered by the integer,
   * so that eg. job$9 sorts before job$10.  Every other pair of names
   * is ordered by plain string comparison.
   *
   * [auto_index name] is [Some n] if [name] is job$ followed by
   * something int_of_string accepts, else [None].  Only the Failure
   * raised by int_of_string is caught; nothing else is swallowed.
   *)
  let auto_index name =
    let len = String.length name in
    if len > 4 && String.sub name 0 4 = "job$" then
      (try Some (int_of_string (String.sub name 4 (len-4)))
       with Failure _ -> None)
    else
      None
  in
  match auto_index name1, auto_index name2 with
  | Some i1, Some i2 -> compare i1 i2
  | _ -> compare name1 name2

(* Run the event system [esys]; this drives the RPC server and the
 * every-job timer that are attached to it.
 *)
let main_loop () =
  Unixqueue.run esys