1 (* Handy tool for managing CSV files.
2 * $Id: csvtool.ml,v 1.11 2008-10-27 21:57:48 rich Exp $
8 (*------------------------------ start of code from extlib *)
9 exception Invalid_string
12 let sublen = String.length sub in
17 let len = String.length str in
19 for i = 0 to len - sublen do
21 while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do
23 if !j = sublen then begin found := i; raise Exit; end;
31 let p = find str sep in
32 let len = String.length sep in
33 let slen = String.length str in
34 String.sub str 0 p, String.sub str (p + len) (slen - p - len)
39 let rec nsplit str sep =
41 let s1 , s2 = split str sep in
44 Invalid_string -> [str]
53 external inj : 'a mut_list -> 'a list = "%identity"
55 let dummy_node () = { hd = Obj.magic (); tl = [] }
57 let rec drop n = function
58 | _ :: l when n > 0 -> drop (n-1) l
62 let rec loop n dst = function
63 | h :: t when n > 0 ->
64 let r = { hd = h; tl = [] } in
70 let dummy = dummy_node() in
73 (*------------------------------ end of extlib code *)
75 (* Parse column specs. *)
76 type colspec = range list
79 | Range of int * int (* 2-5 *)
80 | ToEnd of int (* 7- *)
82 let parse_colspec ~count_zero colspec =
83 let cols = nsplit colspec "," in
88 let first, second = split col "-" in
90 Range (int_of_string first, int_of_string second)
92 ToEnd (int_of_string first)
95 Col (int_of_string col)
98 Failure "int_of_string" ->
99 failwith (colspec ^ ":" ^ col ^ ": invalid column-spec")
102 (* Adjust so columns always count from zero. *)
103 if not count_zero then
107 | Range (s, e) -> Range (s-1, e-1)
108 | ToEnd e -> ToEnd (e-1)
113 let rec width_of_colspec = function
115 | Col c :: rest -> 1 + width_of_colspec rest
116 | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest
118 failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')"
120 (* For closed column specs, this preserves the correct width in the
123 let cols_of_colspec colspec row =
124 let rec loop = function
128 with Failure "nth" -> "") :: loop rest
129 | Range (s, e) :: rest ->
131 let range = take width (drop s row) in
132 let range = List.hd (set_columns width [range]) in
133 List.append range (loop rest)
135 List.append (drop e row) (loop rest)
139 (* The actual commands. *)
140 let cmd_cols ~input_sep ~output_sep ~chan colspec files =
143 let csv = load ~separator:input_sep filename in
144 let csv = List.map (cols_of_colspec colspec) csv in
145 save_out ~separator:output_sep chan csv
148 let cmd_namedcols ~input_sep ~output_sep ~chan names files =
151 let csv = load ~separator:input_sep filename in
154 | [] -> failwith "no rows in this CSV file"
156 (* Do the headers requested exist in the CSV file? If not,
161 if not (List.mem name header) then
162 failwith ("namedcol: requested header not in CSV file: " ^ name)
164 let data = associate header data in
165 let data = List.map (
166 fun row -> List.map (fun name -> List.assoc name row) names
168 save_out ~separator:output_sep chan data
171 let cmd_width ~input_sep ~chan files =
172 let width = List.fold_left (
173 fun width filename ->
174 let csv = load ~separator:input_sep filename in
175 let width = max width (columns csv) in
178 fprintf chan "%d\n" width
180 let cmd_height ~input_sep ~chan files =
181 let height = List.fold_left (
182 fun height filename ->
183 let csv = load ~separator:input_sep filename in
184 let height = height + lines csv in
187 fprintf chan "%d\n" height
189 let cmd_readable ~input_sep ~chan files =
190 let csv = List.concat (List.map (load ~separator:input_sep) files) in
191 save_out_readable chan csv
193 let cmd_cat ~input_sep ~output_sep ~chan files =
194 (* Avoid loading the whole file into memory. *)
196 save_out ~separator:output_sep chan [row]
202 | "-" -> stdin, false
203 | filename -> open_in filename, true in
204 load_rows ~separator:input_sep f in_chan;
205 if close then close_in in_chan
208 let cmd_set_columns ~input_sep ~output_sep ~chan cols files =
209 (* Avoid loading the whole file into memory. *)
212 let csv = set_columns cols csv in
213 save_out ~separator:output_sep chan csv
219 | "-" -> stdin, false
220 | filename -> open_in filename, true in
221 load_rows ~separator:input_sep f in_chan;
222 if close then close_in in_chan
225 let cmd_set_rows ~input_sep ~output_sep ~chan rows files =
226 let csv = List.concat (List.map (load ~separator:input_sep) files) in
227 let csv = set_rows rows csv in
228 save_out ~separator:output_sep chan csv
230 let cmd_head ~input_sep ~output_sep ~chan rows files =
231 (* Avoid loading the whole file into memory, or even loading
234 let nr_rows = ref rows in
236 if !nr_rows > 0 then (
238 save_out ~separator:output_sep chan [row]
243 if !nr_rows > 0 then (
246 | "-" -> stdin, false
247 | filename -> open_in filename, true in
248 load_rows ~separator:input_sep f in_chan;
249 if close then close_in in_chan
253 let cmd_drop ~input_sep ~output_sep ~chan rows files =
254 (* Avoid loading the whole file into memory. *)
255 let nr_rows = ref rows in
258 save_out ~separator:output_sep chan [row]
266 | "-" -> stdin, false
267 | filename -> open_in filename, true in
268 load_rows ~separator:input_sep f in_chan;
269 if close then close_in in_chan
272 let cmd_square ~input_sep ~output_sep ~chan files =
273 let csv = List.concat (List.map (load ~separator:input_sep) files) in
274 let csv = square csv in
275 save_out ~separator:output_sep chan csv
277 let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files =
278 let csv = List.concat (List.map (load ~separator:input_sep) files) in
279 let csv = sub r c rows cols csv in
280 save_out ~separator:output_sep chan csv
282 let cmd_replace ~input_sep ~output_sep ~chan colspec update files =
283 let csv = List.concat (List.map (load ~separator:input_sep) files) in
285 (* Load the update CSV file in. *)
286 let update = Csv.load ~separator:input_sep update in
288 (* Compare two rows for equality by considering only the columns
291 let equal row1 row2 =
292 let row1 = cols_of_colspec colspec row1 in
293 let row2 = cols_of_colspec colspec row2 in
294 0 = Csv.compare [row1] [row2]
297 (* Look for rows in the original to be replaced by rows from the
298 * update file. This is an ugly O(n^2) hack (XXX).
300 let csv = List.filter (
301 fun row -> not (List.exists (equal row) update)
303 let csv = csv @ update in
304 save_out ~separator:output_sep chan csv
306 let cmd_call ~input_sep ~output_sep ~chan command files =
307 (* Avoid loading the whole file into memory. *)
310 command ^ " " ^ String.concat " " (List.map Filename.quote row) in
311 let code = Sys.command cmd in
313 eprintf "%s: terminated with exit code %d\n" command code;
321 | "-" -> stdin, false
322 | filename -> open_in filename, true in
323 load_rows ~separator:input_sep f in_chan;
324 if close then close_in in_chan
327 let rec uniq = function
330 | x :: y :: xs when Pervasives.compare x y = 0 ->
335 let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files =
336 (* Load in the files separately. *)
337 let csvs = List.map (load ~separator:input_sep) files in
339 (* For each CSV file, construct a hash table from row class (key) to
340 * the (possibly empty) output columns (values).
341 * Also construct a hash which has the unique list of row classes.
343 let keys = Hashtbl.create 1023 in
344 let hashes = List.map (
346 let hash = Hashtbl.create 1023 in
349 let key = cols_of_colspec colspec1 row in
350 let value = cols_of_colspec colspec2 row in
351 if not (Hashtbl.mem keys key) then Hashtbl.add keys key true;
352 Hashtbl.add hash key value
358 let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in
360 let value_width = width_of_colspec colspec2 in
362 List.hd (set_columns value_width [[""]]) in
363 let multiple_values =
364 List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in
366 (* Generate output CSV. *)
367 let keys = List.sort Pervasives.compare keys in
368 let keys = List.map (fun key -> key, []) keys in
369 let csv = List.fold_left (
373 let value = try Hashtbl.find_all hash key with Not_found -> [] in
378 | _::_ -> multiple_values in
379 key, (value :: values)
384 key @ List.flatten (List.rev values)
386 save_out ~separator:output_sep chan csv
388 let rec cmd_trim ~input_sep ~output_sep ~chan (top, left, right, bottom) files =
389 let csv = List.concat (List.map (load ~separator:input_sep) files) in
390 let csv = trim ~top ~left ~right ~bottom csv in
391 save_out ~separator:output_sep chan csv
393 and trim_flags flags =
395 try ignore (String.index flags c); true with Not_found -> false
398 let left = set 'l' in
399 let right = set 'r' in
400 let bottom = set 'b' in
401 (top, left, right, bottom)
403 (* Process the arguments. *)
405 "csvtool - Copyright (C) 2005-2006 Richard W.M. Jones, Merjis Ltd.
407 csvtool is a tool for performing manipulations on CSV files from shell scripts.
410 csvtool [-options] command [command-args] input.csv [input2.csv [...]]
414 Return one or more columns from the CSV file.
416 For <column-spec>, see below.
418 Example: csvtool col 1-3,6 input.csv > output.csv
421 Assuming the first row of the CSV file is a list of column headings,
422 this returned the column(s) with the named headings.
424 <names> is a comma-separated list of names.
426 Example: csvtool namedcol Account,Cost input.csv > output.csv
429 Print the maximum width of the CSV file (number of columns in the
433 Print the number of rows in the CSV file.
435 For most CSV files this is equivalent to 'wc -l', but note that
436 some CSV files can contain a row which breaks over two (or more)
440 Set the number of columns to cols (this also makes the CSV file
441 square). Any short rows are padding with blank cells. Any
442 long rows are truncated.
445 'setrows n' sets the number of rows to 'n'. If there are fewer
446 than 'n' rows in the CSV files, then empty blank lines are added.
450 'head n' and 'take n' (which are synonyms) take the first 'n'
451 rows. If there are fewer than 'n' rows, padding is not added.
454 Drop the first 'rows' rows and return the rest (if any).
457 To remove the headings from a CSV file with headings:
458 csvtool drop 1 input.csv > output.csv
460 To extract rows 11 through 20 from a file:
461 csvtool drop 10 input.csv | csvtool take 10 - > output.csv
464 This concatenates the input files together and writes them to
465 the output. You can use this to change the separator character.
467 Example: csvtool -t TAB -u COMMA cat input.tsv > output.csv
469 join <column-spec1> <column-spec2>
470 Join (collate) multiple CSV files together.
472 <column-spec1> controls which columns are compared.
474 <column-spec2> controls which columns are copied into the new file.
477 csvtool join 1 2 coll1.csv coll2.csv > output.csv
479 In the above example, if coll1.csv contains:
482 and coll2.csv contains:
484 then the output will be:
489 Make the CSV square, so all rows have the same length.
491 Example: csvtool square input.csv > input-square.csv
494 Trim empty cells at the top/left/right/bottom of the CSV file.
497 csvtool trim t input.csv # trims empty rows at the top only
498 csvtool trim tb input.csv # trims empty rows at the top & bottom
499 csvtool trim lr input.csv # trims empty columns at left & right
500 csvtool trim tlrb input.csv # trims empty rows/columns all around
503 Take a square subset of the CSV, top left at row r, column c, which
504 is rows deep and cols wide. 'r' and 'c' count from 1, or
505 from 0 if -z option is given.
507 replace <column-spec> update.csv original.csv
508 Replace rows in original.csv with rows from update.csv. The columns
509 in <column-spec> only are used to compare rows in input.csv and
510 update.csv to see if they are candidates for replacement.
513 csvtool replace 3 updates.csv original.csv > new.csv
514 mv new.csv original.csv
517 This calls the external command (or shell function) 'command'
518 followed by a parameter for each column in the CSV file. The
519 external command is called once for each row in the CSV file.
520 If any command returns a non-zero exit code then the whole
524 Use the shell command 'export -f funcname' to export
525 a shell function for use as a command. Within the
526 function, use the positional parameters $1, $2, ...
527 to refer to the columns.
529 Example (with a shell function):
535 csvtool call test my.csv
537 In the above example, if my.csv contains:
547 Print the input CSV in a readable format.
550 A <column-spec> is a comma-separated list of column numbers
554 1 Column 1 (the first, leftmost column)
555 2,5,7 Columns 2, 5 and 7
556 1-3,5 Columns 1, 2, 3 and 5
557 1,5- Columns 1, 5 and up.
559 Columns are numbered starting from 1 unless the -z option is given.
562 csvtool takes a list of input file(s) from the command line.
564 If an input filename is '-' then take input from stdin.
567 Normally the output is written to stdout. Use the -o option
571 The default separator character is , (comma). To change this
572 on input or output see the -t and -u options respectively.
574 Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated
580 let input_sep = ref ',' in
581 let set_input_sep = function
582 | "TAB" -> input_sep := '\t'
583 | "COMMA" -> input_sep := ','
584 | "SPACE" -> input_sep := ' '
585 | s -> input_sep := s.[0]
588 let output_sep = ref ',' in
589 let set_output_sep = function
590 | "TAB" -> output_sep := '\t'
591 | "COMMA" -> output_sep := ','
592 | "SPACE" -> output_sep := ' '
593 | s -> output_sep := s.[0]
596 let count_zero = ref false in
598 let output_file = ref "" in
606 "-t", Arg.String set_input_sep,
607 "Input separator char. Use -t TAB for tab separated input.";
608 "-u", Arg.String set_output_sep,
609 "Output separator char. Use -u TAB for tab separated output.";
610 "-o", Arg.Set_string output_file,
611 "Write output to file (instead of stdout)";
612 "-z", Arg.Set count_zero,
613 "Number columns from 0 instead of 1";
614 "-", Arg.Unit (fun () -> set_rest "-"),
615 "" (* Hack to allow '-' for input from stdin. *)
618 Arg.parse argspec set_rest usage;
620 let input_sep = !input_sep in
621 let output_sep = !output_sep in
622 let count_zero = !count_zero in
623 let output_file = !output_file in
624 let rest = List.rev !rest in
626 (* Set up the output file. *)
628 if output_file <> "" then open_out output_file
632 | ("col"|"cols") :: colspec :: files ->
633 let colspec = parse_colspec ~count_zero colspec in
634 cmd_cols ~input_sep ~output_sep ~chan colspec files
635 | ("namedcol"|"namedcols") :: names :: files ->
636 let names = nsplit names "," in
637 cmd_namedcols ~input_sep ~output_sep ~chan names files
638 | ("width"|"columns") :: files ->
639 cmd_width ~input_sep ~chan files
640 | ("height"|"rows") :: files ->
641 cmd_height ~input_sep ~chan files
642 | "readable" :: files ->
643 cmd_readable ~input_sep ~chan files
644 | ("cat"|"concat") :: files ->
645 cmd_cat ~input_sep ~output_sep ~chan files
646 | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) ->
647 let colspec1 = parse_colspec ~count_zero colspec1 in
648 let colspec2 = parse_colspec ~count_zero colspec2 in
649 cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files
650 | "square" :: files ->
651 cmd_square ~input_sep ~output_sep ~chan files
652 | "sub" :: r :: c :: rows :: cols :: files ->
653 let r = int_of_string r in
654 let r = if not count_zero then r-1 else r in
655 let c = int_of_string c in
656 let c = if not count_zero then c-1 else c in
657 let rows = int_of_string rows in
658 let cols = int_of_string cols in
659 cmd_sub ~input_sep ~output_sep ~chan r c rows cols files
660 | "replace" :: colspec :: update :: files ->
661 let colspec = parse_colspec ~count_zero colspec in
662 cmd_replace ~input_sep ~output_sep ~chan colspec update files
663 | ("setcolumns"|"set_columns"|"set-columns"|
664 "setcols"|"set_cols"|"set-cols") :: cols :: files ->
665 let cols = int_of_string cols in
666 cmd_set_columns ~input_sep ~output_sep ~chan cols files
667 | ("setrows"|"set_rows"|"set-rows") :: rows :: files ->
668 let rows = int_of_string rows in
669 cmd_set_rows ~input_sep ~output_sep ~chan rows files
670 | ("head"|"take") :: rows :: files ->
671 let rows = int_of_string rows in
672 cmd_head ~input_sep ~output_sep ~chan rows files
673 | "drop" :: rows :: files ->
674 let rows = int_of_string rows in
675 cmd_drop ~input_sep ~output_sep ~chan rows files
676 | "call" :: command :: files ->
677 cmd_call ~input_sep ~output_sep ~chan command files
678 | "trim" :: flags :: files ->
679 let flags = trim_flags flags in
680 cmd_trim ~input_sep ~output_sep ~chan flags files
682 prerr_endline (Sys.executable_name ^ " --help for usage");
686 if output_file <> "" then close_out chan