(* Handy tool for managing CSV files.
- * $Id: csvtool.ml,v 1.3 2006-06-06 16:01:52 rich Exp $
+ * $Id: csvtool.ml,v 1.4 2006-10-24 10:06:01 rich Exp $
*)
open Printf
open Csv
-let cmd_cols ~separator ~csv ~chan cols =
- let cols = List.map int_of_string cols in
-
- let output = List.map (
- fun row ->
- let n = List.length row in
- let row = List.map (
- fun col_wanted ->
- if 0 <= col_wanted && col_wanted < n then
- List.nth row col_wanted
- else
- ""
- ) cols in
- row
- ) csv in
- save_out ~separator chan output
-
-let cmd_namedcols ~separator ~csv ~chan names =
- let header, data =
- match csv with
- | [] -> failwith "no rows in this CSV file"
- | h :: t -> h, t in
- let data = associate header data in
- let data = List.map (
- fun row -> List.map (fun name -> List.assoc name row) names
- ) data in
- save_out ~separator chan data
-
-let cmd_width ~csv ~chan () =
- fprintf chan "%d\n" (columns csv)
-
-let cmd_height ~csv ~chan () =
- fprintf chan "%d\n" (lines csv)
-
-let cmd_readable ~csv ~chan () =
+(*------------------------------ start of code from extlib *)
+exception Invalid_string
+
+let find str sub =
+ let sublen = String.length sub in
+ if sublen = 0 then
+ 0
+ else
+ let found = ref 0 in
+ let len = String.length str in
+ try
+ for i = 0 to len - sublen do
+ let j = ref 0 in
+ while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do
+ incr j;
+ if !j = sublen then begin found := i; raise Exit; end;
+ done;
+ done;
+ raise Invalid_string
+ with
+ Exit -> !found
+
+let split str sep =
+ let p = find str sep in
+ let len = String.length sep in
+ let slen = String.length str in
+ String.sub str 0 p, String.sub str (p + len) (slen - p - len)
+
+let nsplit str sep =
+ if str = "" then []
+ else (
+ let rec nsplit str sep =
+ try
+ let s1 , s2 = split str sep in
+ s1 :: nsplit s2 sep
+ with
+ Invalid_string -> [str]
+ in
+ nsplit str sep
+ )
+
+type 'a mut_list = {
+ hd: 'a;
+ mutable tl: 'a list
+}
+external inj : 'a mut_list -> 'a list = "%identity"
+
+let dummy_node () = { hd = Obj.magic (); tl = [] }
+
+let rec drop n = function
+ | _ :: l when n > 0 -> drop (n-1) l
+ | l -> l
+
+let take n l =
+ let rec loop n dst = function
+ | h :: t when n > 0 ->
+ let r = { hd = h; tl = [] } in
+ dst.tl <- inj r;
+ loop (n-1) r t
+ | _ ->
+ ()
+ in
+ let dummy = dummy_node() in
+ loop n dummy l;
+ dummy.tl
+(*------------------------------ end of extlib code *)
+
+(* Parse column specs. *)
+type colspec = range list
+and range =
+ | Col of int (* 0 *)
+ | Range of int * int (* 2-5 *)
+ | ToEnd of int (* 7- *)
+
+let parse_colspec ~count_zero colspec =
+ let cols = nsplit colspec "," in
+ let cols = List.map (
+ fun col ->
+ try
+ (try
+ let first, second = split col "-" in
+ if second <> "" then
+ Range (int_of_string first, int_of_string second)
+ else
+ ToEnd (int_of_string first)
+ with
+ Invalid_string ->
+ Col (int_of_string col)
+ )
+ with
+ Failure "int_of_string" ->
+ failwith (colspec ^ ":" ^ col ^ ": invalid column-spec")
+ ) cols in
+
+ (* Adjust so columns always count from zero. *)
+ if not count_zero then
+ List.map (
+ function
+ | Col c -> Col (c-1)
+ | Range (s, e) -> Range (s-1, e-1)
+ | ToEnd e -> ToEnd (e-1)
+ ) cols
+ else
+ cols
+
+let rec width_of_colspec = function
+ | [] -> 0
+ | Col c :: rest -> 1 + width_of_colspec rest
+ | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest
+ | ToEnd _ :: _ ->
+ failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')"
+
+(* For closed column specs, this preserves the correct width in the
+ * result.
+ *)
+let cols_of_colspec colspec row =
+ let rec loop = function
+ | [] -> []
+ | Col c :: rest ->
+ (try List.nth row c
+ with Failure "nth" -> "") :: loop rest
+ | Range (s, e) :: rest ->
+ let width = e-s+1 in
+ let range = take width (drop s row) in
+ let range = List.hd (set_columns width [range]) in
+ List.append range (loop rest)
+ | ToEnd e :: rest ->
+ List.append (drop e row) (loop rest)
+ in
+ loop colspec
+
+(* The actual commands. *)
+let cmd_cols ~input_sep ~output_sep ~chan colspec files =
+ List.iter (
+ fun filename ->
+ let csv = load ~separator:input_sep filename in
+ let csv = List.map (cols_of_colspec colspec) csv in
+ save_out ~separator:output_sep chan csv
+ ) files
+
+let cmd_namedcols ~input_sep ~output_sep ~chan names files =
+ List.iter (
+ fun filename ->
+ let csv = load ~separator:input_sep filename in
+ let header, data =
+ match csv with
+ | [] -> failwith "no rows in this CSV file"
+ | h :: t -> h, t in
+ let data = associate header data in
+ let data = List.map (
+ fun row -> List.map (fun name -> List.assoc name row) names
+ ) data in
+ save_out ~separator:output_sep chan data
+ ) files
+
+let cmd_width ~input_sep ~chan files =
+ let width = List.fold_left (
+ fun width filename ->
+ let csv = load ~separator:input_sep filename in
+ let width = max width (columns csv) in
+ width
+ ) 0 files in
+ fprintf chan "%d\n" width
+
+let cmd_height ~input_sep ~chan files =
+ let height = List.fold_left (
+ fun height filename ->
+ let csv = load ~separator:input_sep filename in
+ let height = height + lines csv in
+ height
+ ) 0 files in
+ fprintf chan "%d\n" height
+
+let cmd_readable ~input_sep ~chan files =
+ let csv = List.concat (List.map (load ~separator:input_sep) files) in
save_out_readable chan csv
-let cmd_square ~separator ~csv ~chan () =
+let cmd_cat ~input_sep ~output_sep ~chan files =
+ (* Avoid loading the whole file into memory. *)
+ let f row =
+ save_out ~separator:output_sep chan [row]
+ in
+ List.iter (
+ fun filename ->
+ let in_chan = open_in filename in
+ load_rows ~separator:input_sep f in_chan;
+ close_in in_chan
+ ) files
+
+let cmd_square ~input_sep ~output_sep ~chan files =
+ let csv = List.concat (List.map (load ~separator:input_sep) files) in
let csv = square csv in
- save_out ~separator chan csv
-
-let cmd_sub ~separator ~csv ~chan args =
- let r, c, rows, cols =
- match args with
- | [ r; c; rows; cols ] ->
- int_of_string r, int_of_string c,
- int_of_string rows, int_of_string cols
- | _ ->
- failwith "unknown arguments to 'sub' command" in
+ save_out ~separator:output_sep chan csv
+
+let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files =
+ let csv = List.concat (List.map (load ~separator:input_sep) files) in
let csv = sub r c rows cols csv in
- save_out ~separator chan csv
+ save_out ~separator:output_sep chan csv
-let cmd_replace ~separator ~csv ~chan args =
- let ncols, replacement =
- match args with
- | [ ncols; replacement ] -> int_of_string ncols, replacement
- | _ ->
- failwith "unknown arguments to 'replace' command" in
- (* Load the replacement CSV file in. *)
- let replacement = Csv.load ~separator replacement in
-
- (* Compare two rows for equality be considering only the first ncols. *)
- let rec compare ncols row1 row2 =
- if ncols <= 0 then true
- else
- match row1, row2 with
- | [], [] -> true
- | [], _ -> false
- | _, [] -> false
- | x :: xs, y :: ys ->
- let c = Pervasives.compare x y in
- if c <> 0 then false
- else
- compare (ncols-1) xs ys
+let cmd_replace ~input_sep ~output_sep ~chan colspec update files =
+ let csv = List.concat (List.map (load ~separator:input_sep) files) in
+
+ (* Load the update CSV file in. *)
+ let update = Csv.load ~separator:input_sep update in
+
+ (* Compare two rows for equality by considering only the columns
+ * in colspec.
+ *)
+ let equal row1 row2 =
+ let row1 = cols_of_colspec colspec row1 in
+ let row2 = cols_of_colspec colspec row2 in
+ 0 = Csv.compare [row1] [row2]
in
(* Look for rows in the original to be replaced by rows from the
- * replacement file. This is an ugly O(n^2) hack (XXX).
+ * update file. This is an ugly O(n^2) hack (XXX).
*)
let csv = List.filter (
- fun row -> not (List.exists (compare ncols row) replacement)
+ fun row -> not (List.exists (equal row) update)
+ ) csv in
+ let csv = csv @ update in
+ save_out ~separator:output_sep chan csv
+
+let rec uniq = function
+ | [] -> []
+ | [x] -> [x]
+ | x :: y :: xs when Pervasives.compare x y = 0 ->
+ uniq (x :: xs)
+ | x :: y :: xs ->
+ x :: uniq (y :: xs)
+
+let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files =
+ (* Load in the files separately. *)
+ let csvs = List.map (load ~separator:input_sep) files in
+
+ (* For each CSV file, construct a hash table from row class (key) to
+ * the (possibly empty) output columns (values).
+ * Also construct a hash which has the unique list of row classes.
+ *)
+ let keys = Hashtbl.create 1023 in
+ let hashes = List.map (
+ fun csv ->
+ let hash = Hashtbl.create 1023 in
+ List.iter (
+ fun row ->
+ let key = cols_of_colspec colspec1 row in
+ let value = cols_of_colspec colspec2 row in
+ if not (Hashtbl.mem keys key) then Hashtbl.add keys key true;
+ Hashtbl.add hash key value
+ ) csv;
+ hash
+ ) csvs in
+
+ (* Get the keys. *)
+ let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in
+
+ let value_width = width_of_colspec colspec2 in
+ let empty_value =
+ List.hd (set_columns value_width [[""]]) in
+ let multiple_values =
+ List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in
+
+ (* Generate output CSV. *)
+ let keys = List.sort Pervasives.compare keys in
+ let keys = List.map (fun key -> key, []) keys in
+ let csv = List.fold_left (
+ fun keys hash ->
+ List.map (
+ fun (key, values) ->
+ let value = try Hashtbl.find_all hash key with Not_found -> [] in
+ let value =
+ match value with
+ | [] -> empty_value
+ | [value] -> value
+ | _::_ -> multiple_values in
+ key, (value :: values)
+ ) keys
+ ) keys hashes in
+ let csv = List.map (
+ fun (key, values) ->
+ key @ List.flatten (List.rev values)
) csv in
- let csv = csv @ replacement in
- save_out ~separator chan csv
+ save_out ~separator:output_sep chan csv
(* Process the arguments. *)
let usage =
csvtool is a tool for performing manipulations on CSV files from shell scripts.
Summary:
- csvtool [-options] command [command-args] < input.csv
+ csvtool [-options] command [command-args] input.csv [input2.csv [...]]
Commands:
- col [col1] [col2] ...
+ col <column-spec>
Return one or more columns from the CSV file. Columns are numbered
starting from zero.
- namedcol [name1] [name2] ...
+ For <column-spec>, see below.
+
+ Example: csvtool col 1-3,6 input.csv > output.csv
+
+ namedcol <names>
Assuming the first row of the CSV file is a list of column headings,
this returned the column(s) with the named headings.
+ <names> is a comma-separated list of names.
+
+ Example: csvtool namedcol Account,Cost input.csv > output.csv
+
width
- Return the maximum width of the CSV file (number of columns in the
+ Print the maximum width of the CSV file (number of columns in the
widest row).
height
- Return the number of rows in the CSV file.
+ Print the number of rows in the CSV file.
readable
Print the input CSV in a readable format.
+ cat
+ This concatenates the input files together and writes them to
+ the output. You can use this to change the separator character.
+
+ Example: csvtool -t TAB -u , cat input.tsv > output.csv
+
+ join <column-spec1> <column-spec2>
+ Join (collate) multiple CSV files together.
+
+ <column-spec1> controls which columns are compared.
+
+ <column-spec2> controls which columns are copied into the new file.
+
+ Example:
+ csvtool join 1 2 coll1.csv coll2.csv
+ If coll1.csv contains:
+ Computers,$40
+ Software,$100
+ and coll2.csv contains:
+ Computers,$50
+ then the output will be:
+ Computers,$40,$50
+ Software,$100,
+
square
Make the CSV square, so all rows have the same length.
Take a square subset of the CSV, top left at row r, column c (counting
from 0), which is rows deep and cols wide.
- replace ncols file.csv
- Replace rows in input.csv with rows from file.csv. The first ncols
- columns only are used to compare rows in input.csv and file.csv to
- see if they are candidates for replacement.
+ replace <column-spec> update.csv original.csv
+ Replace rows in original.csv with rows from update.csv. The columns
+ in <column-spec> only are used to compare rows in input.csv and
+ update.csv to see if they are candidates for replacement.
+
+ Example:
+ csvtool replace 3 updates.csv original.csv > new.csv
+ mv new.csv original.csv
+
+Column specs:
+ A <column-spec> is a comma-separated list of column numbers
+ or column ranges.
-Input and output files:
- csvtool normally processes its input from stdin and writes its output
- to stdout. Use the -i and -o options to override this behaviour.
+ Examples:
+ 1 Column 1 (the first, leftmost column)
+ 2,5,7 Columns 2, 5 and 7
+ 1-3,5 Columns 1, 2, 3 and 5
+ 1,5- Columns 1, 5 and up.
+
+ Columns are numbered starting from 1 unless the -z option is given.
+
+Input files:
+ csvtool takes a list of input file(s) from the command line.
+ If none are listed, then stdin is used instead.
+
+Output file:
+ Normally the output is written to stdout. Use the -o option
+ to override this.
+
+Separators:
+ The default separator character is , (comma). To change this
+ on input or output see the -t and -u options respectively.
+
+ Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated
+ files.
Options:"
let set_input_sep = function
| "TAB" -> input_sep := '\t'
| "COMMA" -> input_sep := ','
+ | "SPACE" -> input_sep := ' '
| s -> input_sep := s.[0]
in
let set_output_sep = function
| "TAB" -> output_sep := '\t'
| "COMMA" -> output_sep := ','
+ | "SPACE" -> output_sep := ' '
| s -> output_sep := s.[0]
in
- let input_file = ref "" in
+ let count_zero = ref false in
+
let output_file = ref "" in
let argspec = [
"-t", Arg.String set_input_sep,
"Input separator char. Use -t TAB for tab separated input.";
"-u", Arg.String set_output_sep,
- "Output separator char. Use -t TAB for tab separated output.";
- "-i", Arg.Set_string input_file,
- "Read CSV input from file (instead of stdin)";
+ "Output separator char. Use -u TAB for tab separated output.";
"-o", Arg.Set_string output_file,
- "Write output to file (instead of stdout)"
+ "Write output to file (instead of stdout)";
+ "-z", Arg.Set count_zero,
+ "Number columns from 0 instead of 1";
] in
let rest = ref [] in
let input_sep = !input_sep in
let output_sep = !output_sep in
- let input_file = !input_file in
+ let count_zero = !count_zero in
let output_file = !output_file in
let rest = List.rev !rest in
- let cmd, args =
- match rest with
- | [] -> prerr_endline (Sys.executable_name ^ " --help for usage"); exit 1
- | h :: t -> h, t in
-
- (* Read the input file. *)
- let input =
- if input_file <> "" then load ~separator:input_sep input_file
- else load_in ~separator:input_sep stdin in
-
(* Set up the output file. *)
let chan =
if output_file <> "" then open_out output_file
else stdout in
- (match cmd with
- | "col" | "cols" ->
- cmd_cols ~separator:output_sep ~csv:input ~chan args
- | "namedcol" | "namedcols" ->
- cmd_namedcols ~separator:output_sep ~csv:input ~chan args
- | "width" ->
- cmd_width ~csv:input ~chan ()
- | "height" ->
- cmd_height ~csv:input ~chan ()
- | "readable" ->
- cmd_readable ~csv:input ~chan ()
- | "square" ->
- cmd_square ~separator:output_sep ~csv:input ~chan ()
- | "sub" ->
- cmd_sub ~separator:output_sep ~csv:input ~chan args
- | "replace" ->
- cmd_replace ~separator:output_sep ~csv:input ~chan args
- | _ -> prerr_endline (Sys.executable_name ^ " --help for usage")
+ (match rest with
+ | ("col"|"cols") :: colspec :: files ->
+ let colspec = parse_colspec ~count_zero colspec in
+ cmd_cols ~input_sep ~output_sep ~chan colspec files
+ | ("namedcol"|"namedcols") :: names :: files ->
+ let names = nsplit names "," in
+ cmd_namedcols ~input_sep ~output_sep ~chan names files
+ | "width" :: files ->
+ cmd_width ~input_sep ~chan files
+ | "height" :: files ->
+ cmd_height ~input_sep ~chan files
+ | "readable" :: files ->
+ cmd_readable ~input_sep ~chan files
+ | ("cat"|"concat") :: files ->
+ cmd_cat ~input_sep ~output_sep ~chan files
+ | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) ->
+ let colspec1 = parse_colspec ~count_zero colspec1 in
+ let colspec2 = parse_colspec ~count_zero colspec2 in
+ cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files
+ | "square" :: files ->
+ cmd_square ~input_sep ~output_sep ~chan files
+ | "sub" :: r :: c :: rows :: cols :: files ->
+ let r = int_of_string r in
+ let c = int_of_string c in
+ let rows = int_of_string rows in
+ let cols = int_of_string cols in
+ cmd_sub ~input_sep ~output_sep ~chan r c rows cols files
+ | "replace" :: colspec :: update :: files ->
+ let colspec = parse_colspec ~count_zero colspec in
+ cmd_replace ~input_sep ~output_sep ~chan colspec update files
+ | _ ->
+ prerr_endline (Sys.executable_name ^ " --help for usage");
+ exit 2
);
if output_file <> "" then close_out chan