From: rich Date: Tue, 24 Oct 2006 10:06:01 +0000 (+0000) Subject: Csvtool: X-Git-Url: http://git.annexia.org/?a=commitdiff_plain;h=8a84862a75870ad54780fedfeff5fecc64dfa901;p=ocaml-csv.git Csvtool: - rationalised handling of parameters throughout - added "join" operation for JC. --- diff --git a/csvtool.ml b/csvtool.ml index 8ad7ee9..d594ea6 100644 --- a/csvtool.ml +++ b/csvtool.ml @@ -1,94 +1,293 @@ (* Handy tool for managing CSV files. - * $Id: csvtool.ml,v 1.3 2006-06-06 16:01:52 rich Exp $ + * $Id: csvtool.ml,v 1.4 2006-10-24 10:06:01 rich Exp $ *) open Printf open Csv -let cmd_cols ~separator ~csv ~chan cols = - let cols = List.map int_of_string cols in - - let output = List.map ( - fun row -> - let n = List.length row in - let row = List.map ( - fun col_wanted -> - if 0 <= col_wanted && col_wanted < n then - List.nth row col_wanted - else - "" - ) cols in - row - ) csv in - save_out ~separator chan output - -let cmd_namedcols ~separator ~csv ~chan names = - let header, data = - match csv with - | [] -> failwith "no rows in this CSV file" - | h :: t -> h, t in - let data = associate header data in - let data = List.map ( - fun row -> List.map (fun name -> List.assoc name row) names - ) data in - save_out ~separator chan data - -let cmd_width ~csv ~chan () = - fprintf chan "%d\n" (columns csv) - -let cmd_height ~csv ~chan () = - fprintf chan "%d\n" (lines csv) - -let cmd_readable ~csv ~chan () = +(*------------------------------ start of code from extlib *) +exception Invalid_string + +let find str sub = + let sublen = String.length sub in + if sublen = 0 then + 0 + else + let found = ref 0 in + let len = String.length str in + try + for i = 0 to len - sublen do + let j = ref 0 in + while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do + incr j; + if !j = sublen then begin found := i; raise Exit; end; + done; + done; + raise Invalid_string + with + Exit -> !found + +let split str sep = + let p = find str sep in + let len = String.length sep in + let slen = String.length str in + String.sub str 0 p, String.sub str (p + len) (slen - p - len) + +let nsplit str sep = + if str = "" then [] + else ( + let rec nsplit str sep = + try + let s1 , s2 = split str sep in + s1 :: nsplit s2 sep + with + Invalid_string -> [str] + in + nsplit str sep + ) + +type 'a mut_list = { + hd: 'a; + mutable tl: 'a list +} +external inj : 'a mut_list -> 'a list = "%identity" + +let dummy_node () = { hd = Obj.magic (); tl = [] } + +let rec drop n = function + | _ :: l when n > 0 -> drop (n-1) l + | l -> l + +let take n l = + let rec loop n dst = function + | h :: t when n > 0 -> + let r = { hd = h; tl = [] } in + dst.tl <- inj r; + loop (n-1) r t + | _ -> + () + in + let dummy = dummy_node() in + loop n dummy l; + dummy.tl +(*------------------------------ end of extlib code *) + +(* Parse column specs. *) +type colspec = range list +and range = + | Col of int (* 0 *) + | Range of int * int (* 2-5 *) + | ToEnd of int (* 7- *) + +let parse_colspec ~count_zero colspec = + let cols = nsplit colspec "," in + let cols = List.map ( + fun col -> + try + (try + let first, second = split col "-" in + if second <> "" then + Range (int_of_string first, int_of_string second) + else + ToEnd (int_of_string first) + with + Invalid_string -> + Col (int_of_string col) + ) + with + Failure "int_of_string" -> + failwith (colspec ^ ":" ^ col ^ ": invalid column-spec") + ) cols in + + (* Adjust so columns always count from zero. *) + if not count_zero then + List.map ( + function + | Col c -> Col (c-1) + | Range (s, e) -> Range (s-1, e-1) + | ToEnd e -> ToEnd (e-1) + ) cols + else + cols + +let rec width_of_colspec = function + | [] -> 0 + | Col c :: rest -> 1 + width_of_colspec rest + | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest + | ToEnd _ :: _ -> + failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')" + +(* For closed column specs, this preserves the correct width in the + * result. + *) +let cols_of_colspec colspec row = + let rec loop = function + | [] -> [] + | Col c :: rest -> + (try List.nth row c + with Failure "nth" -> "") :: loop rest + | Range (s, e) :: rest -> + let width = e-s+1 in + let range = take width (drop s row) in + let range = List.hd (set_columns width [range]) in + List.append range (loop rest) + | ToEnd e :: rest -> + List.append (drop e row) (loop rest) + in + loop colspec + +(* The actual commands. *) +let cmd_cols ~input_sep ~output_sep ~chan colspec files = + List.iter ( + fun filename -> + let csv = load ~separator:input_sep filename in + let csv = List.map (cols_of_colspec colspec) csv in + save_out ~separator:output_sep chan csv + ) files + +let cmd_namedcols ~input_sep ~output_sep ~chan names files = + List.iter ( + fun filename -> + let csv = load ~separator:input_sep filename in + let header, data = + match csv with + | [] -> failwith "no rows in this CSV file" + | h :: t -> h, t in + let data = associate header data in + let data = List.map ( + fun row -> List.map (fun name -> List.assoc name row) names + ) data in + save_out ~separator:output_sep chan data + ) files + +let cmd_width ~input_sep ~chan files = + let width = List.fold_left ( + fun width filename -> + let csv = load ~separator:input_sep filename in + let width = max width (columns csv) in + width + ) 0 files in + fprintf chan "%d\n" width + +let cmd_height ~input_sep ~chan files = + let height = List.fold_left ( + fun height filename -> + let csv = load ~separator:input_sep filename in + let height = height + lines csv in + height + ) 0 files in + fprintf chan "%d\n" height + +let cmd_readable ~input_sep ~chan files = + let csv = List.concat (List.map (load ~separator:input_sep) files) in save_out_readable chan csv -let cmd_square ~separator ~csv ~chan () = +let cmd_cat ~input_sep ~output_sep ~chan files = + (* Avoid loading the whole file into memory. *) + let f row = + save_out ~separator:output_sep chan [row] + in + List.iter ( + fun filename -> + let in_chan = open_in filename in + load_rows ~separator:input_sep f in_chan; + close_in in_chan + ) files + +let cmd_square ~input_sep ~output_sep ~chan files = + let csv = List.concat (List.map (load ~separator:input_sep) files) in let csv = square csv in - save_out ~separator chan csv - -let cmd_sub ~separator ~csv ~chan args = - let r, c, rows, cols = - match args with - | [ r; c; rows; cols ] -> - int_of_string r, int_of_string c, - int_of_string rows, int_of_string cols - | _ -> - failwith "unknown arguments to 'sub' command" in + save_out ~separator:output_sep chan csv + +let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files = + let csv = List.concat (List.map (load ~separator:input_sep) files) in let csv = sub r c rows cols csv in - save_out ~separator chan csv + save_out ~separator:output_sep chan csv -let cmd_replace ~separator ~csv ~chan args = - let ncols, replacement = - match args with - | [ ncols; replacement ] -> int_of_string ncols, replacement - | _ -> - failwith "unknown arguments to 'replace' command" in - (* Load the replacement CSV file in. *) - let replacement = Csv.load ~separator replacement in - - (* Compare two rows for equality be considering only the first ncols. *) - let rec compare ncols row1 row2 = - if ncols <= 0 then true - else - match row1, row2 with - | [], [] -> true - | [], _ -> false - | _, [] -> false - | x :: xs, y :: ys -> - let c = Pervasives.compare x y in - if c <> 0 then false - else - compare (ncols-1) xs ys +let cmd_replace ~input_sep ~output_sep ~chan colspec update files = + let csv = List.concat (List.map (load ~separator:input_sep) files) in + + (* Load the update CSV file in. *) + let update = Csv.load ~separator:input_sep update in + + (* Compare two rows for equality by considering only the columns + * in colspec. + *) + let equal row1 row2 = + let row1 = cols_of_colspec colspec row1 in + let row2 = cols_of_colspec colspec row2 in + 0 = Csv.compare [row1] [row2] in (* Look for rows in the original to be replaced by rows from the - * replacement file. This is an ugly O(n^2) hack (XXX). + * update file. This is an ugly O(n^2) hack (XXX). *) let csv = List.filter ( - fun row -> not (List.exists (compare ncols row) replacement) + fun row -> not (List.exists (equal row) update) + ) csv in + let csv = csv @ update in + save_out ~separator:output_sep chan csv + +let rec uniq = function + | [] -> [] + | [x] -> [x] + | x :: y :: xs when Pervasives.compare x y = 0 -> + uniq (x :: xs) + | x :: y :: xs -> + x :: uniq (y :: xs) + +let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files = + (* Load in the files separately. *) + let csvs = List.map (load ~separator:input_sep) files in + + (* For each CSV file, construct a hash table from row class (key) to + * the (possibly empty) output columns (values). + * Also construct a hash which has the unique list of row classes. + *) + let keys = Hashtbl.create 1023 in + let hashes = List.map ( + fun csv -> + let hash = Hashtbl.create 1023 in + List.iter ( + fun row -> + let key = cols_of_colspec colspec1 row in + let value = cols_of_colspec colspec2 row in + if not (Hashtbl.mem keys key) then Hashtbl.add keys key true; + Hashtbl.add hash key value + ) csv; + hash + ) csvs in + + (* Get the keys. *) + let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in + + let value_width = width_of_colspec colspec2 in + let empty_value = + List.hd (set_columns value_width [[""]]) in + let multiple_values = + List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in + + (* Generate output CSV. *) + let keys = List.sort Pervasives.compare keys in + let keys = List.map (fun key -> key, []) keys in + let csv = List.fold_left ( + fun keys hash -> + List.map ( + fun (key, values) -> + let value = try Hashtbl.find_all hash key with Not_found -> [] in + let value = + match value with + | [] -> empty_value + | [value] -> value + | _::_ -> multiple_values in + key, (value :: values) + ) keys + ) keys hashes in + let csv = List.map ( + fun (key, values) -> + key @ List.flatten (List.rev values) ) csv in - let csv = csv @ replacement in - save_out ~separator chan csv + save_out ~separator:output_sep chan csv (* Process the arguments. *) let usage = @@ -97,27 +296,59 @@ let usage = csvtool is a tool for performing manipulations on CSV files from shell scripts. Summary: - csvtool [-options] command [command-args] < input.csv + csvtool [-options] command [command-args] input.csv [input2.csv [...]] Commands: - col [col1] [col2] ... + col Return one or more columns from the CSV file. Columns are numbered starting from zero. - namedcol [name1] [name2] ... + For , see below. + + Example: csvtool col 1-3,6 input.csv > output.csv + + namedcol Assuming the first row of the CSV file is a list of column headings, this returned the column(s) with the named headings. + is a comma-separated list of names. + + Example: csvtool namedcol Account,Cost input.csv > output.csv + width - Return the maximum width of the CSV file (number of columns in the + Print the maximum width of the CSV file (number of columns in the widest row). height - Return the number of rows in the CSV file. + Print the number of rows in the CSV file. readable Print the input CSV in a readable format. + cat + This concatenates the input files together and writes them to + the output. You can use this to change the separator character. + + Example: csvtool -t TAB -u , cat input.tsv > output.csv + + join + Join (collate) multiple CSV files together. + + controls which columns are compared. + + controls which columns are copied into the new file. + + Example: + csvtool join 1 2 coll1.csv coll2.csv + If coll1.csv contains: + Computers,$40 + Software,$100 + and coll2.csv contains: + Computers,$50 + then the output will be: + Computers,$40,$50 + Software,$100, + square Make the CSV square, so all rows have the same length. @@ -125,14 +356,41 @@ Commands: Take a square subset of the CSV, top left at row r, column c (counting from 0), which is rows deep and cols wide. - replace ncols file.csv - Replace rows in input.csv with rows from file.csv. The first ncols - columns only are used to compare rows in input.csv and file.csv to - see if they are candidates for replacement. + replace update.csv original.csv + Replace rows in original.csv with rows from update.csv. The columns + in only are used to compare rows in input.csv and + update.csv to see if they are candidates for replacement. + + Example: + csvtool replace 3 updates.csv original.csv > new.csv + mv new.csv original.csv + +Column specs: + A is a comma-separated list of column numbers + or column ranges. -Input and output files: - csvtool normally processes its input from stdin and writes its output - to stdout. Use the -i and -o options to override this behaviour. + Examples: + 1 Column 1 (the first, leftmost column) + 2,5,7 Columns 2, 5 and 7 + 1-3,5 Columns 1, 2, 3 and 5 + 1,5- Columns 1, 5 and up. + + Columns are numbered starting from 1 unless the -z option is given. + +Input files: + csvtool takes a list of input file(s) from the command line. + If none are listed, then stdin is used instead. + +Output file: + Normally the output is written to stdout. Use the -o option + to override this. + +Separators: + The default separator character is , (comma). To change this + on input or output see the -t and -u options respectively. + + Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated + files. Options:" @@ -141,6 +399,7 @@ let () = let set_input_sep = function | "TAB" -> input_sep := '\t' | "COMMA" -> input_sep := ',' + | "SPACE" -> input_sep := ' ' | s -> input_sep := s.[0] in @@ -148,21 +407,23 @@ let () = let set_output_sep = function | "TAB" -> output_sep := '\t' | "COMMA" -> output_sep := ',' + | "SPACE" -> output_sep := ' ' | s -> output_sep := s.[0] in - let input_file = ref "" in + let count_zero = ref false in + let output_file = ref "" in let argspec = [ "-t", Arg.String set_input_sep, "Input separator char. Use -t TAB for tab separated input."; "-u", Arg.String set_output_sep, - "Output separator char. Use -t TAB for tab separated output."; - "-i", Arg.Set_string input_file, - "Read CSV input from file (instead of stdin)"; + "Output separator char. Use -u TAB for tab separated output."; "-o", Arg.Set_string output_file, - "Write output to file (instead of stdout)" + "Write output to file (instead of stdout)"; + "-z", Arg.Set count_zero, + "Number columns from 0 instead of 1"; ] in let rest = ref [] in @@ -174,43 +435,48 @@ let () = let input_sep = !input_sep in let output_sep = !output_sep in - let input_file = !input_file in + let count_zero = !count_zero in let output_file = !output_file in let rest = List.rev !rest in - let cmd, args = - match rest with - | [] -> prerr_endline (Sys.executable_name ^ " --help for usage"); exit 1 - | h :: t -> h, t in - - (* Read the input file. *) - let input = - if input_file <> "" then load ~separator:input_sep input_file - else load_in ~separator:input_sep stdin in - (* Set up the output file. *) let chan = if output_file <> "" then open_out output_file else stdout in - (match cmd with - | "col" | "cols" -> - cmd_cols ~separator:output_sep ~csv:input ~chan args - | "namedcol" | "namedcols" -> - cmd_namedcols ~separator:output_sep ~csv:input ~chan args - | "width" -> - cmd_width ~csv:input ~chan () - | "height" -> - cmd_height ~csv:input ~chan () - | "readable" -> - cmd_readable ~csv:input ~chan () - | "square" -> - cmd_square ~separator:output_sep ~csv:input ~chan () - | "sub" -> - cmd_sub ~separator:output_sep ~csv:input ~chan args - | "replace" -> - cmd_replace ~separator:output_sep ~csv:input ~chan args - | _ -> prerr_endline (Sys.executable_name ^ " --help for usage") + (match rest with + | ("col"|"cols") :: colspec :: files -> + let colspec = parse_colspec ~count_zero colspec in + cmd_cols ~input_sep ~output_sep ~chan colspec files + | ("namedcol"|"namedcols") :: names :: files -> + let names = nsplit names "," in + cmd_namedcols ~input_sep ~output_sep ~chan names files + | "width" :: files -> + cmd_width ~input_sep ~chan files + | "height" :: files -> + cmd_height ~input_sep ~chan files + | "readable" :: files -> + cmd_readable ~input_sep ~chan files + | ("cat"|"concat") :: files -> + cmd_cat ~input_sep ~output_sep ~chan files + | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) -> + let colspec1 = parse_colspec ~count_zero colspec1 in + let colspec2 = parse_colspec ~count_zero colspec2 in + cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files + | "square" :: files -> + cmd_square ~input_sep ~output_sep ~chan files + | "sub" :: r :: c :: rows :: cols :: files -> + let r = int_of_string r in + let c = int_of_string c in + let rows = int_of_string rows in + let cols = int_of_string cols in + cmd_sub ~input_sep ~output_sep ~chan r c rows cols files + | "replace" :: colspec :: update :: files -> + let colspec = parse_colspec ~count_zero colspec in + cmd_replace ~input_sep ~output_sep ~chan colspec update files + | _ -> + prerr_endline (Sys.executable_name ^ " --help for usage"); + exit 2 ); if output_file <> "" then close_out chan