(* Handy tool for managing CSV files. * $Id: csvtool.ml,v 1.6 2006-11-24 10:09:56 rich Exp $ *) open Printf open Csv (*------------------------------ start of code from extlib *) exception Invalid_string let find str sub = let sublen = String.length sub in if sublen = 0 then 0 else let found = ref 0 in let len = String.length str in try for i = 0 to len - sublen do let j = ref 0 in while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do incr j; if !j = sublen then begin found := i; raise Exit; end; done; done; raise Invalid_string with Exit -> !found let split str sep = let p = find str sep in let len = String.length sep in let slen = String.length str in String.sub str 0 p, String.sub str (p + len) (slen - p - len) let nsplit str sep = if str = "" then [] else ( let rec nsplit str sep = try let s1 , s2 = split str sep in s1 :: nsplit s2 sep with Invalid_string -> [str] in nsplit str sep ) type 'a mut_list = { hd: 'a; mutable tl: 'a list } external inj : 'a mut_list -> 'a list = "%identity" let dummy_node () = { hd = Obj.magic (); tl = [] } let rec drop n = function | _ :: l when n > 0 -> drop (n-1) l | l -> l let take n l = let rec loop n dst = function | h :: t when n > 0 -> let r = { hd = h; tl = [] } in dst.tl <- inj r; loop (n-1) r t | _ -> () in let dummy = dummy_node() in loop n dummy l; dummy.tl (*------------------------------ end of extlib code *) (* Parse column specs. *) type colspec = range list and range = | Col of int (* 0 *) | Range of int * int (* 2-5 *) | ToEnd of int (* 7- *) let parse_colspec ~count_zero colspec = let cols = nsplit colspec "," in let cols = List.map ( fun col -> try (try let first, second = split col "-" in if second <> "" then Range (int_of_string first, int_of_string second) else ToEnd (int_of_string first) with Invalid_string -> Col (int_of_string col) ) with Failure "int_of_string" -> failwith (colspec ^ ":" ^ col ^ ": invalid column-spec") ) cols in (* Adjust so columns always count from zero. *) if not count_zero then List.map ( function | Col c -> Col (c-1) | Range (s, e) -> Range (s-1, e-1) | ToEnd e -> ToEnd (e-1) ) cols else cols let rec width_of_colspec = function | [] -> 0 | Col c :: rest -> 1 + width_of_colspec rest | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest | ToEnd _ :: _ -> failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')" (* For closed column specs, this preserves the correct width in the * result. *) let cols_of_colspec colspec row = let rec loop = function | [] -> [] | Col c :: rest -> (try List.nth row c with Failure "nth" -> "") :: loop rest | Range (s, e) :: rest -> let width = e-s+1 in let range = take width (drop s row) in let range = List.hd (set_columns width [range]) in List.append range (loop rest) | ToEnd e :: rest -> List.append (drop e row) (loop rest) in loop colspec (* The actual commands. *) let cmd_cols ~input_sep ~output_sep ~chan colspec files = List.iter ( fun filename -> let csv = load ~separator:input_sep filename in let csv = List.map (cols_of_colspec colspec) csv in save_out ~separator:output_sep chan csv ) files let cmd_namedcols ~input_sep ~output_sep ~chan names files = List.iter ( fun filename -> let csv = load ~separator:input_sep filename in let header, data = match csv with | [] -> failwith "no rows in this CSV file" | h :: t -> h, t in let data = associate header data in let data = List.map ( fun row -> List.map (fun name -> List.assoc name row) names ) data in save_out ~separator:output_sep chan data ) files let cmd_width ~input_sep ~chan files = let width = List.fold_left ( fun width filename -> let csv = load ~separator:input_sep filename in let width = max width (columns csv) in width ) 0 files in fprintf chan "%d\n" width let cmd_height ~input_sep ~chan files = let height = List.fold_left ( fun height filename -> let csv = load ~separator:input_sep filename in let height = height + lines csv in height ) 0 files in fprintf chan "%d\n" height let cmd_readable ~input_sep ~chan files = let csv = List.concat (List.map (load ~separator:input_sep) files) in save_out_readable chan csv let cmd_cat ~input_sep ~output_sep ~chan files = (* Avoid loading the whole file into memory. *) let f row = save_out ~separator:output_sep chan [row] in List.iter ( fun filename -> let in_chan = open_in filename in load_rows ~separator:input_sep f in_chan; close_in in_chan ) files let cmd_square ~input_sep ~output_sep ~chan files = let csv = List.concat (List.map (load ~separator:input_sep) files) in let csv = square csv in save_out ~separator:output_sep chan csv let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files = let csv = List.concat (List.map (load ~separator:input_sep) files) in let csv = sub r c rows cols csv in save_out ~separator:output_sep chan csv let cmd_replace ~input_sep ~output_sep ~chan colspec update files = let csv = List.concat (List.map (load ~separator:input_sep) files) in (* Load the update CSV file in. *) let update = Csv.load ~separator:input_sep update in (* Compare two rows for equality by considering only the columns * in colspec. *) let equal row1 row2 = let row1 = cols_of_colspec colspec row1 in let row2 = cols_of_colspec colspec row2 in 0 = Csv.compare [row1] [row2] in (* Look for rows in the original to be replaced by rows from the * update file. This is an ugly O(n^2) hack (XXX). *) let csv = List.filter ( fun row -> not (List.exists (equal row) update) ) csv in let csv = csv @ update in save_out ~separator:output_sep chan csv let rec uniq = function | [] -> [] | [x] -> [x] | x :: y :: xs when Pervasives.compare x y = 0 -> uniq (x :: xs) | x :: y :: xs -> x :: uniq (y :: xs) let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files = (* Load in the files separately. *) let csvs = List.map (load ~separator:input_sep) files in (* For each CSV file, construct a hash table from row class (key) to * the (possibly empty) output columns (values). * Also construct a hash which has the unique list of row classes. *) let keys = Hashtbl.create 1023 in let hashes = List.map ( fun csv -> let hash = Hashtbl.create 1023 in List.iter ( fun row -> let key = cols_of_colspec colspec1 row in let value = cols_of_colspec colspec2 row in if not (Hashtbl.mem keys key) then Hashtbl.add keys key true; Hashtbl.add hash key value ) csv; hash ) csvs in (* Get the keys. *) let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in let value_width = width_of_colspec colspec2 in let empty_value = List.hd (set_columns value_width [[""]]) in let multiple_values = List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in (* Generate output CSV. *) let keys = List.sort Pervasives.compare keys in let keys = List.map (fun key -> key, []) keys in let csv = List.fold_left ( fun keys hash -> List.map ( fun (key, values) -> let value = try Hashtbl.find_all hash key with Not_found -> [] in let value = match value with | [] -> empty_value | [value] -> value | _::_ -> multiple_values in key, (value :: values) ) keys ) keys hashes in let csv = List.map ( fun (key, values) -> key @ List.flatten (List.rev values) ) csv in save_out ~separator:output_sep chan csv (* Process the arguments. *) let usage = "csvtool - Copyright (C) 2005-2006 Richard W.M. Jones, Merjis Ltd. csvtool is a tool for performing manipulations on CSV files from shell scripts. Summary: csvtool [-options] command [command-args] input.csv [input2.csv [...]] Commands: col Return one or more columns from the CSV file. For , see below. Example: csvtool col 1-3,6 input.csv > output.csv namedcol Assuming the first row of the CSV file is a list of column headings, this returned the column(s) with the named headings. is a comma-separated list of names. Example: csvtool namedcol Account,Cost input.csv > output.csv width Print the maximum width of the CSV file (number of columns in the widest row). height Print the number of rows in the CSV file. readable Print the input CSV in a readable format. cat This concatenates the input files together and writes them to the output. You can use this to change the separator character. Example: csvtool -t TAB -u , cat input.tsv > output.csv join Join (collate) multiple CSV files together. controls which columns are compared. controls which columns are copied into the new file. Example: csvtool join 1 2 coll1.csv coll2.csv If coll1.csv contains: Computers,$40 Software,$100 and coll2.csv contains: Computers,$50 then the output will be: Computers,$40,$50 Software,$100, square Make the CSV square, so all rows have the same length. sub r c rows cols Take a square subset of the CSV, top left at row r, column c (counting from 0), which is rows deep and cols wide. replace update.csv original.csv Replace rows in original.csv with rows from update.csv. The columns in only are used to compare rows in input.csv and update.csv to see if they are candidates for replacement. Example: csvtool replace 3 updates.csv original.csv > new.csv mv new.csv original.csv Column specs: A is a comma-separated list of column numbers or column ranges. Examples: 1 Column 1 (the first, leftmost column) 2,5,7 Columns 2, 5 and 7 1-3,5 Columns 1, 2, 3 and 5 1,5- Columns 1, 5 and up. Columns are numbered starting from 1 unless the -z option is given. Input files: csvtool takes a list of input file(s) from the command line. Output file: Normally the output is written to stdout. Use the -o option to override this. Separators: The default separator character is , (comma). To change this on input or output see the -t and -u options respectively. Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated files. Options:" let () = let input_sep = ref ',' in let set_input_sep = function | "TAB" -> input_sep := '\t' | "COMMA" -> input_sep := ',' | "SPACE" -> input_sep := ' ' | s -> input_sep := s.[0] in let output_sep = ref ',' in let set_output_sep = function | "TAB" -> output_sep := '\t' | "COMMA" -> output_sep := ',' | "SPACE" -> output_sep := ' ' | s -> output_sep := s.[0] in let count_zero = ref false in let output_file = ref "" in let argspec = [ "-t", Arg.String set_input_sep, "Input separator char. Use -t TAB for tab separated input."; "-u", Arg.String set_output_sep, "Output separator char. Use -u TAB for tab separated output."; "-o", Arg.Set_string output_file, "Write output to file (instead of stdout)"; "-z", Arg.Set count_zero, "Number columns from 0 instead of 1"; ] in let rest = ref [] in let set_rest str = rest := str :: !rest in Arg.parse argspec set_rest usage; let input_sep = !input_sep in let output_sep = !output_sep in let count_zero = !count_zero in let output_file = !output_file in let rest = List.rev !rest in (* Set up the output file. *) let chan = if output_file <> "" then open_out output_file else stdout in (match rest with | ("col"|"cols") :: colspec :: files -> let colspec = parse_colspec ~count_zero colspec in cmd_cols ~input_sep ~output_sep ~chan colspec files | ("namedcol"|"namedcols") :: names :: files -> let names = nsplit names "," in cmd_namedcols ~input_sep ~output_sep ~chan names files | "width" :: files -> cmd_width ~input_sep ~chan files | "height" :: files -> cmd_height ~input_sep ~chan files | "readable" :: files -> cmd_readable ~input_sep ~chan files | ("cat"|"concat") :: files -> cmd_cat ~input_sep ~output_sep ~chan files | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) -> let colspec1 = parse_colspec ~count_zero colspec1 in let colspec2 = parse_colspec ~count_zero colspec2 in cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files | "square" :: files -> cmd_square ~input_sep ~output_sep ~chan files | "sub" :: r :: c :: rows :: cols :: files -> let r = int_of_string r in let c = int_of_string c in let rows = int_of_string rows in let cols = int_of_string cols in cmd_sub ~input_sep ~output_sep ~chan r c rows cols files | "replace" :: colspec :: update :: files -> let colspec = parse_colspec ~count_zero colspec in cmd_replace ~input_sep ~output_sep ~chan colspec update files | _ -> prerr_endline (Sys.executable_name ^ " --help for usage"); exit 2 ); if output_file <> "" then close_out chan