1 (* Handy tool for managing CSV files.
2 * $Id: csvtool.ml,v 1.7 2006-11-24 13:58:56 rich Exp $
8 (*------------------------------ start of code from extlib *)
9 exception Invalid_string
12 let sublen = String.length sub in
17 let len = String.length str in
19 for i = 0 to len - sublen do
21 while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do
23 if !j = sublen then begin found := i; raise Exit; end;
31 let p = find str sep in
32 let len = String.length sep in
33 let slen = String.length str in
34 String.sub str 0 p, String.sub str (p + len) (slen - p - len)
39 let rec nsplit str sep =
41 let s1 , s2 = split str sep in
44 Invalid_string -> [str]
53 external inj : 'a mut_list -> 'a list = "%identity"
55 let dummy_node () = { hd = Obj.magic (); tl = [] }
57 let rec drop n = function
58 | _ :: l when n > 0 -> drop (n-1) l
62 let rec loop n dst = function
63 | h :: t when n > 0 ->
64 let r = { hd = h; tl = [] } in
70 let dummy = dummy_node() in
73 (*------------------------------ end of extlib code *)
75 (* Parse column specs. *)
76 type colspec = range list
79 | Range of int * int (* 2-5 *)
80 | ToEnd of int (* 7- *)
82 let parse_colspec ~count_zero colspec =
83 let cols = nsplit colspec "," in
88 let first, second = split col "-" in
90 Range (int_of_string first, int_of_string second)
92 ToEnd (int_of_string first)
95 Col (int_of_string col)
98 Failure "int_of_string" ->
99 failwith (colspec ^ ":" ^ col ^ ": invalid column-spec")
102 (* Adjust so columns always count from zero. *)
103 if not count_zero then
107 | Range (s, e) -> Range (s-1, e-1)
108 | ToEnd e -> ToEnd (e-1)
113 let rec width_of_colspec = function
115 | Col c :: rest -> 1 + width_of_colspec rest
116 | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest
118 failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')"
120 (* For closed column specs, this preserves the correct width in the
123 let cols_of_colspec colspec row =
124 let rec loop = function
128 with Failure "nth" -> "") :: loop rest
129 | Range (s, e) :: rest ->
131 let range = take width (drop s row) in
132 let range = List.hd (set_columns width [range]) in
133 List.append range (loop rest)
135 List.append (drop e row) (loop rest)
139 (* The actual commands. *)
140 let cmd_cols ~input_sep ~output_sep ~chan colspec files =
143 let csv = load ~separator:input_sep filename in
144 let csv = List.map (cols_of_colspec colspec) csv in
145 save_out ~separator:output_sep chan csv
148 let cmd_namedcols ~input_sep ~output_sep ~chan names files =
151 let csv = load ~separator:input_sep filename in
154 | [] -> failwith "no rows in this CSV file"
156 (* Do the headers requested exist in the CSV file? If not,
161 if not (List.mem name header) then
162 failwith ("namedcol: requested header not in CSV file: " ^ name)
164 let data = associate header data in
165 let data = List.map (
166 fun row -> List.map (fun name -> List.assoc name row) names
168 save_out ~separator:output_sep chan data
171 let cmd_width ~input_sep ~chan files =
172 let width = List.fold_left (
173 fun width filename ->
174 let csv = load ~separator:input_sep filename in
175 let width = max width (columns csv) in
178 fprintf chan "%d\n" width
180 let cmd_height ~input_sep ~chan files =
181 let height = List.fold_left (
182 fun height filename ->
183 let csv = load ~separator:input_sep filename in
184 let height = height + lines csv in
187 fprintf chan "%d\n" height
189 let cmd_readable ~input_sep ~chan files =
190 let csv = List.concat (List.map (load ~separator:input_sep) files) in
191 save_out_readable chan csv
193 let cmd_cat ~input_sep ~output_sep ~chan files =
194 (* Avoid loading the whole file into memory. *)
196 save_out ~separator:output_sep chan [row]
200 let in_chan = open_in filename in
201 load_rows ~separator:input_sep f in_chan;
205 let cmd_square ~input_sep ~output_sep ~chan files =
206 let csv = List.concat (List.map (load ~separator:input_sep) files) in
207 let csv = square csv in
208 save_out ~separator:output_sep chan csv
210 let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files =
211 let csv = List.concat (List.map (load ~separator:input_sep) files) in
212 let csv = sub r c rows cols csv in
213 save_out ~separator:output_sep chan csv
215 let cmd_replace ~input_sep ~output_sep ~chan colspec update files =
216 let csv = List.concat (List.map (load ~separator:input_sep) files) in
218 (* Load the update CSV file in. *)
219 let update = Csv.load ~separator:input_sep update in
221 (* Compare two rows for equality by considering only the columns
224 let equal row1 row2 =
225 let row1 = cols_of_colspec colspec row1 in
226 let row2 = cols_of_colspec colspec row2 in
227 0 = Csv.compare [row1] [row2]
230 (* Look for rows in the original to be replaced by rows from the
231 * update file. This is an ugly O(n^2) hack (XXX).
233 let csv = List.filter (
234 fun row -> not (List.exists (equal row) update)
236 let csv = csv @ update in
237 save_out ~separator:output_sep chan csv
239 let rec uniq = function
242 | x :: y :: xs when Pervasives.compare x y = 0 ->
247 let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files =
248 (* Load in the files separately. *)
249 let csvs = List.map (load ~separator:input_sep) files in
251 (* For each CSV file, construct a hash table from row class (key) to
252 * the (possibly empty) output columns (values).
253 * Also construct a hash which has the unique list of row classes.
255 let keys = Hashtbl.create 1023 in
256 let hashes = List.map (
258 let hash = Hashtbl.create 1023 in
261 let key = cols_of_colspec colspec1 row in
262 let value = cols_of_colspec colspec2 row in
263 if not (Hashtbl.mem keys key) then Hashtbl.add keys key true;
264 Hashtbl.add hash key value
270 let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in
272 let value_width = width_of_colspec colspec2 in
274 List.hd (set_columns value_width [[""]]) in
275 let multiple_values =
276 List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in
278 (* Generate output CSV. *)
279 let keys = List.sort Pervasives.compare keys in
280 let keys = List.map (fun key -> key, []) keys in
281 let csv = List.fold_left (
285 let value = try Hashtbl.find_all hash key with Not_found -> [] in
290 | _::_ -> multiple_values in
291 key, (value :: values)
296 key @ List.flatten (List.rev values)
298 save_out ~separator:output_sep chan csv
300 (* Process the arguments. *)
302 "csvtool - Copyright (C) 2005-2006 Richard W.M. Jones, Merjis Ltd.
304 csvtool is a tool for performing manipulations on CSV files from shell scripts.
307 csvtool [-options] command [command-args] input.csv [input2.csv [...]]
311 Return one or more columns from the CSV file.
313 For <column-spec>, see below.
315 Example: csvtool col 1-3,6 input.csv > output.csv
318 Assuming the first row of the CSV file is a list of column headings,
319 this returned the column(s) with the named headings.
321 <names> is a comma-separated list of names.
323 Example: csvtool namedcol Account,Cost input.csv > output.csv
326 Print the maximum width of the CSV file (number of columns in the
330 Print the number of rows in the CSV file.
333 Print the input CSV in a readable format.
336 This concatenates the input files together and writes them to
337 the output. You can use this to change the separator character.
339 Example: csvtool -t TAB -u , cat input.tsv > output.csv
341 join <column-spec1> <column-spec2>
342 Join (collate) multiple CSV files together.
344 <column-spec1> controls which columns are compared.
346 <column-spec2> controls which columns are copied into the new file.
349 csvtool join 1 2 coll1.csv coll2.csv
350 If coll1.csv contains:
353 and coll2.csv contains:
355 then the output will be:
360 Make the CSV square, so all rows have the same length.
363 Take a square subset of the CSV, top left at row r, column c (counting
364 from 0), which is rows deep and cols wide.
366 replace <column-spec> update.csv original.csv
367 Replace rows in original.csv with rows from update.csv. The columns
368 in <column-spec> only are used to compare rows in input.csv and
369 update.csv to see if they are candidates for replacement.
372 csvtool replace 3 updates.csv original.csv > new.csv
373 mv new.csv original.csv
376 A <column-spec> is a comma-separated list of column numbers
380 1 Column 1 (the first, leftmost column)
381 2,5,7 Columns 2, 5 and 7
382 1-3,5 Columns 1, 2, 3 and 5
383 1,5- Columns 1, 5 and up.
385 Columns are numbered starting from 1 unless the -z option is given.
388 csvtool takes a list of input file(s) from the command line.
391 Normally the output is written to stdout. Use the -o option
395 The default separator character is , (comma). To change this
396 on input or output see the -t and -u options respectively.
398 Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated
404 let input_sep = ref ',' in
405 let set_input_sep = function
406 | "TAB" -> input_sep := '\t'
407 | "COMMA" -> input_sep := ','
408 | "SPACE" -> input_sep := ' '
409 | s -> input_sep := s.[0]
412 let output_sep = ref ',' in
413 let set_output_sep = function
414 | "TAB" -> output_sep := '\t'
415 | "COMMA" -> output_sep := ','
416 | "SPACE" -> output_sep := ' '
417 | s -> output_sep := s.[0]
420 let count_zero = ref false in
422 let output_file = ref "" in
425 "-t", Arg.String set_input_sep,
426 "Input separator char. Use -t TAB for tab separated input.";
427 "-u", Arg.String set_output_sep,
428 "Output separator char. Use -u TAB for tab separated output.";
429 "-o", Arg.Set_string output_file,
430 "Write output to file (instead of stdout)";
431 "-z", Arg.Set count_zero,
432 "Number columns from 0 instead of 1";
440 Arg.parse argspec set_rest usage;
442 let input_sep = !input_sep in
443 let output_sep = !output_sep in
444 let count_zero = !count_zero in
445 let output_file = !output_file in
446 let rest = List.rev !rest in
448 (* Set up the output file. *)
450 if output_file <> "" then open_out output_file
454 | ("col"|"cols") :: colspec :: files ->
455 let colspec = parse_colspec ~count_zero colspec in
456 cmd_cols ~input_sep ~output_sep ~chan colspec files
457 | ("namedcol"|"namedcols") :: names :: files ->
458 let names = nsplit names "," in
459 cmd_namedcols ~input_sep ~output_sep ~chan names files
460 | "width" :: files ->
461 cmd_width ~input_sep ~chan files
462 | "height" :: files ->
463 cmd_height ~input_sep ~chan files
464 | "readable" :: files ->
465 cmd_readable ~input_sep ~chan files
466 | ("cat"|"concat") :: files ->
467 cmd_cat ~input_sep ~output_sep ~chan files
468 | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) ->
469 let colspec1 = parse_colspec ~count_zero colspec1 in
470 let colspec2 = parse_colspec ~count_zero colspec2 in
471 cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files
472 | "square" :: files ->
473 cmd_square ~input_sep ~output_sep ~chan files
474 | "sub" :: r :: c :: rows :: cols :: files ->
475 let r = int_of_string r in
476 let c = int_of_string c in
477 let rows = int_of_string rows in
478 let cols = int_of_string cols in
479 cmd_sub ~input_sep ~output_sep ~chan r c rows cols files
480 | "replace" :: colspec :: update :: files ->
481 let colspec = parse_colspec ~count_zero colspec in
482 cmd_replace ~input_sep ~output_sep ~chan colspec update files
484 prerr_endline (Sys.executable_name ^ " --help for usage");
488 if output_file <> "" then close_out chan