1 (* Handy tool for managing CSV files.
2 * $Id: csvtool.ml,v 1.9 2006-12-05 22:24:44 rich Exp $
8 (*------------------------------ start of code from extlib *)
9 exception Invalid_string
12 let sublen = String.length sub in
17 let len = String.length str in
19 for i = 0 to len - sublen do
21 while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do
23 if !j = sublen then begin found := i; raise Exit; end;
31 let p = find str sep in
32 let len = String.length sep in
33 let slen = String.length str in
34 String.sub str 0 p, String.sub str (p + len) (slen - p - len)
39 let rec nsplit str sep =
41 let s1 , s2 = split str sep in
44 Invalid_string -> [str]
53 external inj : 'a mut_list -> 'a list = "%identity"
55 let dummy_node () = { hd = Obj.magic (); tl = [] }
57 let rec drop n = function
58 | _ :: l when n > 0 -> drop (n-1) l
62 let rec loop n dst = function
63 | h :: t when n > 0 ->
64 let r = { hd = h; tl = [] } in
70 let dummy = dummy_node() in
73 (*------------------------------ end of extlib code *)
75 (* Parse column specs. *)
76 type colspec = range list
79 | Range of int * int (* 2-5 *)
80 | ToEnd of int (* 7- *)
82 let parse_colspec ~count_zero colspec =
83 let cols = nsplit colspec "," in
88 let first, second = split col "-" in
90 Range (int_of_string first, int_of_string second)
92 ToEnd (int_of_string first)
95 Col (int_of_string col)
98 Failure "int_of_string" ->
99 failwith (colspec ^ ":" ^ col ^ ": invalid column-spec")
102 (* Adjust so columns always count from zero. *)
103 if not count_zero then
107 | Range (s, e) -> Range (s-1, e-1)
108 | ToEnd e -> ToEnd (e-1)
113 let rec width_of_colspec = function
115 | Col c :: rest -> 1 + width_of_colspec rest
116 | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest
118 failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')"
120 (* For closed column specs, this preserves the correct width in the
123 let cols_of_colspec colspec row =
124 let rec loop = function
128 with Failure "nth" -> "") :: loop rest
129 | Range (s, e) :: rest ->
131 let range = take width (drop s row) in
132 let range = List.hd (set_columns width [range]) in
133 List.append range (loop rest)
135 List.append (drop e row) (loop rest)
139 (* The actual commands. *)
140 let cmd_cols ~input_sep ~output_sep ~chan colspec files =
143 let csv = load ~separator:input_sep filename in
144 let csv = List.map (cols_of_colspec colspec) csv in
145 save_out ~separator:output_sep chan csv
148 let cmd_namedcols ~input_sep ~output_sep ~chan names files =
151 let csv = load ~separator:input_sep filename in
154 | [] -> failwith "no rows in this CSV file"
156 (* Do the headers requested exist in the CSV file? If not,
161 if not (List.mem name header) then
162 failwith ("namedcol: requested header not in CSV file: " ^ name)
164 let data = associate header data in
165 let data = List.map (
166 fun row -> List.map (fun name -> List.assoc name row) names
168 save_out ~separator:output_sep chan data
171 let cmd_width ~input_sep ~chan files =
172 let width = List.fold_left (
173 fun width filename ->
174 let csv = load ~separator:input_sep filename in
175 let width = max width (columns csv) in
178 fprintf chan "%d\n" width
180 let cmd_height ~input_sep ~chan files =
181 let height = List.fold_left (
182 fun height filename ->
183 let csv = load ~separator:input_sep filename in
184 let height = height + lines csv in
187 fprintf chan "%d\n" height
189 let cmd_readable ~input_sep ~chan files =
190 let csv = List.concat (List.map (load ~separator:input_sep) files) in
191 save_out_readable chan csv
193 let cmd_cat ~input_sep ~output_sep ~chan files =
194 (* Avoid loading the whole file into memory. *)
196 save_out ~separator:output_sep chan [row]
202 | "-" -> stdin, false
203 | filename -> open_in filename, true in
204 load_rows ~separator:input_sep f in_chan;
205 if close then close_in in_chan
208 let cmd_set_columns ~input_sep ~output_sep ~chan cols files =
209 (* Avoid loading the whole file into memory. *)
212 let csv = set_columns cols csv in
213 save_out ~separator:output_sep chan csv
219 | "-" -> stdin, false
220 | filename -> open_in filename, true in
221 load_rows ~separator:input_sep f in_chan;
222 if close then close_in in_chan
225 let cmd_set_rows ~input_sep ~output_sep ~chan rows files =
226 let csv = List.concat (List.map (load ~separator:input_sep) files) in
227 let csv = set_rows rows csv in
228 save_out ~separator:output_sep chan csv
230 let cmd_head ~input_sep ~output_sep ~chan rows files =
231 (* Avoid loading the whole file into memory, or even loading
234 let nr_rows = ref rows in
236 if !nr_rows > 0 then (
238 save_out ~separator:output_sep chan [row]
243 if !nr_rows > 0 then (
246 | "-" -> stdin, false
247 | filename -> open_in filename, true in
248 load_rows ~separator:input_sep f in_chan;
249 if close then close_in in_chan
253 let cmd_drop ~input_sep ~output_sep ~chan rows files =
254 (* Avoid loading the whole file into memory. *)
255 let nr_rows = ref rows in
258 save_out ~separator:output_sep chan [row]
266 | "-" -> stdin, false
267 | filename -> open_in filename, true in
268 load_rows ~separator:input_sep f in_chan;
269 if close then close_in in_chan
272 let cmd_square ~input_sep ~output_sep ~chan files =
273 let csv = List.concat (List.map (load ~separator:input_sep) files) in
274 let csv = square csv in
275 save_out ~separator:output_sep chan csv
277 let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files =
278 let csv = List.concat (List.map (load ~separator:input_sep) files) in
279 let csv = sub r c rows cols csv in
280 save_out ~separator:output_sep chan csv
282 let cmd_replace ~input_sep ~output_sep ~chan colspec update files =
283 let csv = List.concat (List.map (load ~separator:input_sep) files) in
285 (* Load the update CSV file in. *)
286 let update = Csv.load ~separator:input_sep update in
288 (* Compare two rows for equality by considering only the columns
291 let equal row1 row2 =
292 let row1 = cols_of_colspec colspec row1 in
293 let row2 = cols_of_colspec colspec row2 in
294 0 = Csv.compare [row1] [row2]
297 (* Look for rows in the original to be replaced by rows from the
298 * update file. This is an ugly O(n^2) hack (XXX).
300 let csv = List.filter (
301 fun row -> not (List.exists (equal row) update)
303 let csv = csv @ update in
304 save_out ~separator:output_sep chan csv
306 let cmd_call ~input_sep ~output_sep ~chan command files =
307 (* Avoid loading the whole file into memory. *)
310 command ^ " " ^ String.concat " " (List.map Filename.quote row) in
311 let code = Sys.command cmd in
313 eprintf "%s: terminated with exit code %d\n" command code;
319 let in_chan = open_in filename in
320 load_rows ~separator:input_sep f in_chan;
324 let rec uniq = function
327 | x :: y :: xs when Pervasives.compare x y = 0 ->
332 let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files =
333 (* Load in the files separately. *)
334 let csvs = List.map (load ~separator:input_sep) files in
336 (* For each CSV file, construct a hash table from row class (key) to
337 * the (possibly empty) output columns (values).
338 * Also construct a hash which has the unique list of row classes.
340 let keys = Hashtbl.create 1023 in
341 let hashes = List.map (
343 let hash = Hashtbl.create 1023 in
346 let key = cols_of_colspec colspec1 row in
347 let value = cols_of_colspec colspec2 row in
348 if not (Hashtbl.mem keys key) then Hashtbl.add keys key true;
349 Hashtbl.add hash key value
355 let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in
357 let value_width = width_of_colspec colspec2 in
359 List.hd (set_columns value_width [[""]]) in
360 let multiple_values =
361 List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in
363 (* Generate output CSV. *)
364 let keys = List.sort Pervasives.compare keys in
365 let keys = List.map (fun key -> key, []) keys in
366 let csv = List.fold_left (
370 let value = try Hashtbl.find_all hash key with Not_found -> [] in
375 | _::_ -> multiple_values in
376 key, (value :: values)
381 key @ List.flatten (List.rev values)
383 save_out ~separator:output_sep chan csv
385 (* Process the arguments. *)
387 "csvtool - Copyright (C) 2005-2006 Richard W.M. Jones, Merjis Ltd.
389 csvtool is a tool for performing manipulations on CSV files from shell scripts.
392 csvtool [-options] command [command-args] input.csv [input2.csv [...]]
396 Return one or more columns from the CSV file.
398 For <column-spec>, see below.
400 Example: csvtool col 1-3,6 input.csv > output.csv
403 Assuming the first row of the CSV file is a list of column headings,
404 this returned the column(s) with the named headings.
406 <names> is a comma-separated list of names.
408 Example: csvtool namedcol Account,Cost input.csv > output.csv
411 Print the maximum width of the CSV file (number of columns in the
415 Print the number of rows in the CSV file.
417 For most CSV files this is equivalent to 'wc -l', but note that
418 some CSV files can contain a row which breaks over two (or more)
422 Set the number of columns to cols (this also makes the CSV file
423 square). Any short rows are padding with blank cells. Any
424 long rows are truncated.
427 'setrows n' sets the number of rows to 'n'. If there are fewer
428 than 'n' rows in the CSV files, then empty blank lines are added.
432 'head n' and 'take n' (which are synonyms) take the first 'n'
433 rows. If there are fewer than 'n' rows, padding is not added.
436 Drop the first 'rows' rows and return the rest (if any).
439 To remove the headings from a CSV file with headings:
440 csvtool drop 1 input.csv > output.csv
442 To extract rows 11 through 20 from a file:
443 csvtool drop 10 input.csv | csvtool take 10 - > output.csv
446 This concatenates the input files together and writes them to
447 the output. You can use this to change the separator character.
449 Example: csvtool -t TAB -u COMMA cat input.tsv > output.csv
451 join <column-spec1> <column-spec2>
452 Join (collate) multiple CSV files together.
454 <column-spec1> controls which columns are compared.
456 <column-spec2> controls which columns are copied into the new file.
459 csvtool join 1 2 coll1.csv coll2.csv > output.csv
461 In the above example, if coll1.csv contains:
464 and coll2.csv contains:
466 then the output will be:
471 Make the CSV square, so all rows have the same length.
473 Example: csvtool square input.csv > input-square.csv
476 Take a square subset of the CSV, top left at row r, column c, which
477 is rows deep and cols wide. 'r' and 'c' count from 1, or
478 from 0 if -z option is given.
480 replace <column-spec> update.csv original.csv
481 Replace rows in original.csv with rows from update.csv. The columns
482 in <column-spec> only are used to compare rows in input.csv and
483 update.csv to see if they are candidates for replacement.
486 csvtool replace 3 updates.csv original.csv > new.csv
487 mv new.csv original.csv
490 This calls the external command (or shell function) 'command'
491 followed by a parameter for each column in the CSV file. The
492 external command is called once for each row in the CSV file.
493 If any command returns a non-zero exit code then the whole
497 Use the shell command 'export -f funcname' to export
498 a shell function for use as a command. Within the
499 function, use the positional parameters $1, $2, ...
500 to refer to the columns.
502 Example (with a shell function):
508 csvtool call test my.csv
510 In the above example, if my.csv contains:
520 Print the input CSV in a readable format.
523 A <column-spec> is a comma-separated list of column numbers
527 1 Column 1 (the first, leftmost column)
528 2,5,7 Columns 2, 5 and 7
529 1-3,5 Columns 1, 2, 3 and 5
530 1,5- Columns 1, 5 and up.
532 Columns are numbered starting from 1 unless the -z option is given.
535 csvtool takes a list of input file(s) from the command line.
537 If an input filename is '-' then take input from stdin.
540 Normally the output is written to stdout. Use the -o option
544 The default separator character is , (comma). To change this
545 on input or output see the -t and -u options respectively.
547 Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated
553 let input_sep = ref ',' in
554 let set_input_sep = function
555 | "TAB" -> input_sep := '\t'
556 | "COMMA" -> input_sep := ','
557 | "SPACE" -> input_sep := ' '
558 | s -> input_sep := s.[0]
561 let output_sep = ref ',' in
562 let set_output_sep = function
563 | "TAB" -> output_sep := '\t'
564 | "COMMA" -> output_sep := ','
565 | "SPACE" -> output_sep := ' '
566 | s -> output_sep := s.[0]
569 let count_zero = ref false in
571 let output_file = ref "" in
579 "-t", Arg.String set_input_sep,
580 "Input separator char. Use -t TAB for tab separated input.";
581 "-u", Arg.String set_output_sep,
582 "Output separator char. Use -u TAB for tab separated output.";
583 "-o", Arg.Set_string output_file,
584 "Write output to file (instead of stdout)";
585 "-z", Arg.Set count_zero,
586 "Number columns from 0 instead of 1";
587 "-", Arg.Unit (fun () -> set_rest "-"),
588 "" (* Hack to allow '-' for input from stdin. *)
591 Arg.parse argspec set_rest usage;
593 let input_sep = !input_sep in
594 let output_sep = !output_sep in
595 let count_zero = !count_zero in
596 let output_file = !output_file in
597 let rest = List.rev !rest in
599 (* Set up the output file. *)
601 if output_file <> "" then open_out output_file
605 | ("col"|"cols") :: colspec :: files ->
606 let colspec = parse_colspec ~count_zero colspec in
607 cmd_cols ~input_sep ~output_sep ~chan colspec files
608 | ("namedcol"|"namedcols") :: names :: files ->
609 let names = nsplit names "," in
610 cmd_namedcols ~input_sep ~output_sep ~chan names files
611 | ("width"|"columns") :: files ->
612 cmd_width ~input_sep ~chan files
613 | ("height"|"rows") :: files ->
614 cmd_height ~input_sep ~chan files
615 | "readable" :: files ->
616 cmd_readable ~input_sep ~chan files
617 | ("cat"|"concat") :: files ->
618 cmd_cat ~input_sep ~output_sep ~chan files
619 | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) ->
620 let colspec1 = parse_colspec ~count_zero colspec1 in
621 let colspec2 = parse_colspec ~count_zero colspec2 in
622 cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files
623 | "square" :: files ->
624 cmd_square ~input_sep ~output_sep ~chan files
625 | "sub" :: r :: c :: rows :: cols :: files ->
626 let r = int_of_string r in
627 let r = if not count_zero then r-1 else r in
628 let c = int_of_string c in
629 let c = if not count_zero then c-1 else c in
630 let rows = int_of_string rows in
631 let cols = int_of_string cols in
632 cmd_sub ~input_sep ~output_sep ~chan r c rows cols files
633 | "replace" :: colspec :: update :: files ->
634 let colspec = parse_colspec ~count_zero colspec in
635 cmd_replace ~input_sep ~output_sep ~chan colspec update files
636 | ("setcolumns"|"set_columns"|"set-columns"|
637 "setcols"|"set_cols"|"set-cols") :: cols :: files ->
638 let cols = int_of_string cols in
639 cmd_set_columns ~input_sep ~output_sep ~chan cols files
640 | ("setrows"|"set_rows"|"set-rows") :: rows :: files ->
641 let rows = int_of_string rows in
642 cmd_set_rows ~input_sep ~output_sep ~chan rows files
643 | ("head"|"take") :: rows :: files ->
644 let rows = int_of_string rows in
645 cmd_head ~input_sep ~output_sep ~chan rows files
646 | "drop" :: rows :: files ->
647 let rows = int_of_string rows in
648 cmd_drop ~input_sep ~output_sep ~chan rows files
649 | "call" :: command :: files ->
650 cmd_call ~input_sep ~output_sep ~chan command files
652 prerr_endline (Sys.executable_name ^ " --help for usage");
656 if output_file <> "" then close_out chan