1 (* Handy tool for managing CSV files.
2 * $Id: csvtool.ml,v 1.8 2006-11-24 15:49:24 rich Exp $
8 (*------------------------------ start of code from extlib *)
9 exception Invalid_string
12 let sublen = String.length sub in
17 let len = String.length str in
19 for i = 0 to len - sublen do
21 while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do
23 if !j = sublen then begin found := i; raise Exit; end;
31 let p = find str sep in
32 let len = String.length sep in
33 let slen = String.length str in
34 String.sub str 0 p, String.sub str (p + len) (slen - p - len)
39 let rec nsplit str sep =
41 let s1 , s2 = split str sep in
44 Invalid_string -> [str]
53 external inj : 'a mut_list -> 'a list = "%identity"
55 let dummy_node () = { hd = Obj.magic (); tl = [] }
57 let rec drop n = function
58 | _ :: l when n > 0 -> drop (n-1) l
62 let rec loop n dst = function
63 | h :: t when n > 0 ->
64 let r = { hd = h; tl = [] } in
70 let dummy = dummy_node() in
73 (*------------------------------ end of extlib code *)
75 (* Parse column specs. *)
76 type colspec = range list
79 | Range of int * int (* 2-5 *)
80 | ToEnd of int (* 7- *)
82 let parse_colspec ~count_zero colspec =
83 let cols = nsplit colspec "," in
88 let first, second = split col "-" in
90 Range (int_of_string first, int_of_string second)
92 ToEnd (int_of_string first)
95 Col (int_of_string col)
98 Failure "int_of_string" ->
99 failwith (colspec ^ ":" ^ col ^ ": invalid column-spec")
102 (* Adjust so columns always count from zero. *)
103 if not count_zero then
107 | Range (s, e) -> Range (s-1, e-1)
108 | ToEnd e -> ToEnd (e-1)
113 let rec width_of_colspec = function
115 | Col c :: rest -> 1 + width_of_colspec rest
116 | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest
118 failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')"
120 (* For closed column specs, this preserves the correct width in the
123 let cols_of_colspec colspec row =
124 let rec loop = function
128 with Failure "nth" -> "") :: loop rest
129 | Range (s, e) :: rest ->
131 let range = take width (drop s row) in
132 let range = List.hd (set_columns width [range]) in
133 List.append range (loop rest)
135 List.append (drop e row) (loop rest)
139 (* The actual commands. *)
140 let cmd_cols ~input_sep ~output_sep ~chan colspec files =
143 let csv = load ~separator:input_sep filename in
144 let csv = List.map (cols_of_colspec colspec) csv in
145 save_out ~separator:output_sep chan csv
148 let cmd_namedcols ~input_sep ~output_sep ~chan names files =
151 let csv = load ~separator:input_sep filename in
154 | [] -> failwith "no rows in this CSV file"
156 (* Do the headers requested exist in the CSV file? If not,
161 if not (List.mem name header) then
162 failwith ("namedcol: requested header not in CSV file: " ^ name)
164 let data = associate header data in
165 let data = List.map (
166 fun row -> List.map (fun name -> List.assoc name row) names
168 save_out ~separator:output_sep chan data
171 let cmd_width ~input_sep ~chan files =
172 let width = List.fold_left (
173 fun width filename ->
174 let csv = load ~separator:input_sep filename in
175 let width = max width (columns csv) in
178 fprintf chan "%d\n" width
180 let cmd_height ~input_sep ~chan files =
181 let height = List.fold_left (
182 fun height filename ->
183 let csv = load ~separator:input_sep filename in
184 let height = height + lines csv in
187 fprintf chan "%d\n" height
189 let cmd_readable ~input_sep ~chan files =
190 let csv = List.concat (List.map (load ~separator:input_sep) files) in
191 save_out_readable chan csv
193 let cmd_cat ~input_sep ~output_sep ~chan files =
194 (* Avoid loading the whole file into memory. *)
196 save_out ~separator:output_sep chan [row]
202 | "-" -> stdin, false
203 | filename -> open_in filename, true in
204 load_rows ~separator:input_sep f in_chan;
205 if close then close_in in_chan
208 let cmd_set_columns ~input_sep ~output_sep ~chan cols files =
209 (* Avoid loading the whole file into memory. *)
212 let csv = set_columns cols csv in
213 save_out ~separator:output_sep chan csv
219 | "-" -> stdin, false
220 | filename -> open_in filename, true in
221 load_rows ~separator:input_sep f in_chan;
222 if close then close_in in_chan
225 let cmd_set_rows ~input_sep ~output_sep ~chan rows files =
226 let csv = List.concat (List.map (load ~separator:input_sep) files) in
227 let csv = set_rows rows csv in
228 save_out ~separator:output_sep chan csv
230 let cmd_head ~input_sep ~output_sep ~chan rows files =
231 (* Avoid loading the whole file into memory, or even loading
234 let nr_rows = ref rows in
236 if !nr_rows > 0 then (
238 save_out ~separator:output_sep chan [row]
243 if !nr_rows > 0 then (
246 | "-" -> stdin, false
247 | filename -> open_in filename, true in
248 load_rows ~separator:input_sep f in_chan;
249 if close then close_in in_chan
253 let cmd_drop ~input_sep ~output_sep ~chan rows files =
254 (* Avoid loading the whole file into memory. *)
255 let nr_rows = ref rows in
258 save_out ~separator:output_sep chan [row]
266 | "-" -> stdin, false
267 | filename -> open_in filename, true in
268 load_rows ~separator:input_sep f in_chan;
269 if close then close_in in_chan
272 let cmd_square ~input_sep ~output_sep ~chan files =
273 let csv = List.concat (List.map (load ~separator:input_sep) files) in
274 let csv = square csv in
275 save_out ~separator:output_sep chan csv
277 let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files =
278 let csv = List.concat (List.map (load ~separator:input_sep) files) in
279 let csv = sub r c rows cols csv in
280 save_out ~separator:output_sep chan csv
282 let cmd_replace ~input_sep ~output_sep ~chan colspec update files =
283 let csv = List.concat (List.map (load ~separator:input_sep) files) in
285 (* Load the update CSV file in. *)
286 let update = Csv.load ~separator:input_sep update in
288 (* Compare two rows for equality by considering only the columns
291 let equal row1 row2 =
292 let row1 = cols_of_colspec colspec row1 in
293 let row2 = cols_of_colspec colspec row2 in
294 0 = Csv.compare [row1] [row2]
297 (* Look for rows in the original to be replaced by rows from the
298 * update file. This is an ugly O(n^2) hack (XXX).
300 let csv = List.filter (
301 fun row -> not (List.exists (equal row) update)
303 let csv = csv @ update in
304 save_out ~separator:output_sep chan csv
306 let rec uniq = function
309 | x :: y :: xs when Pervasives.compare x y = 0 ->
314 let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files =
315 (* Load in the files separately. *)
316 let csvs = List.map (load ~separator:input_sep) files in
318 (* For each CSV file, construct a hash table from row class (key) to
319 * the (possibly empty) output columns (values).
320 * Also construct a hash which has the unique list of row classes.
322 let keys = Hashtbl.create 1023 in
323 let hashes = List.map (
325 let hash = Hashtbl.create 1023 in
328 let key = cols_of_colspec colspec1 row in
329 let value = cols_of_colspec colspec2 row in
330 if not (Hashtbl.mem keys key) then Hashtbl.add keys key true;
331 Hashtbl.add hash key value
337 let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in
339 let value_width = width_of_colspec colspec2 in
341 List.hd (set_columns value_width [[""]]) in
342 let multiple_values =
343 List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in
345 (* Generate output CSV. *)
346 let keys = List.sort Pervasives.compare keys in
347 let keys = List.map (fun key -> key, []) keys in
348 let csv = List.fold_left (
352 let value = try Hashtbl.find_all hash key with Not_found -> [] in
357 | _::_ -> multiple_values in
358 key, (value :: values)
363 key @ List.flatten (List.rev values)
365 save_out ~separator:output_sep chan csv
367 (* Process the arguments. *)
369 "csvtool - Copyright (C) 2005-2006 Richard W.M. Jones, Merjis Ltd.
371 csvtool is a tool for performing manipulations on CSV files from shell scripts.
374 csvtool [-options] command [command-args] input.csv [input2.csv [...]]
378 Return one or more columns from the CSV file.
380 For <column-spec>, see below.
382 Example: csvtool col 1-3,6 input.csv > output.csv
385 Assuming the first row of the CSV file is a list of column headings,
386 this returned the column(s) with the named headings.
388 <names> is a comma-separated list of names.
390 Example: csvtool namedcol Account,Cost input.csv > output.csv
393 Print the maximum width of the CSV file (number of columns in the
397 Print the number of rows in the CSV file.
399 For most CSV files this is equivalent to 'wc -l', but note that
400 some CSV files can contain a row which breaks over two (or more)
404 Set the number of columns to cols (this also makes the CSV file
405 square). Any short rows are padding with blank cells. Any
406 long rows are truncated.
409 'setrows n' sets the number of rows to 'n'. If there are fewer
410 than 'n' rows in the CSV files, then empty blank lines are added.
414 'head n' and 'take n' (which are synonyms) take the first 'n'
415 rows. If there are fewer than 'n' rows, padding is not added.
418 Drop the first 'rows' rows and return the rest (if any).
421 To remove the headings from a CSV file with headings:
422 csvtool drop 1 input.csv > output.csv
424 To extract rows 11 through 20 from a file:
425 csvtool drop 10 input.csv | csvtool take 10 - > output.csv
428 This concatenates the input files together and writes them to
429 the output. You can use this to change the separator character.
431 Example: csvtool -t TAB -u COMMA cat input.tsv > output.csv
433 join <column-spec1> <column-spec2>
434 Join (collate) multiple CSV files together.
436 <column-spec1> controls which columns are compared.
438 <column-spec2> controls which columns are copied into the new file.
441 csvtool join 1 2 coll1.csv coll2.csv > output.csv
442 If coll1.csv contains:
445 and coll2.csv contains:
447 then the output will be:
452 Make the CSV square, so all rows have the same length.
454 Example: csvtool square input.csv > input-square.csv
457 Take a square subset of the CSV, top left at row r, column c, which
458 is rows deep and cols wide. 'r' and 'c' count from 1, or
459 from 0 if -z option is given.
461 replace <column-spec> update.csv original.csv
462 Replace rows in original.csv with rows from update.csv. The columns
463 in <column-spec> only are used to compare rows in input.csv and
464 update.csv to see if they are candidates for replacement.
467 csvtool replace 3 updates.csv original.csv > new.csv
468 mv new.csv original.csv
471 Print the input CSV in a readable format.
474 A <column-spec> is a comma-separated list of column numbers
478 1 Column 1 (the first, leftmost column)
479 2,5,7 Columns 2, 5 and 7
480 1-3,5 Columns 1, 2, 3 and 5
481 1,5- Columns 1, 5 and up.
483 Columns are numbered starting from 1 unless the -z option is given.
486 csvtool takes a list of input file(s) from the command line.
488 If an input filename is '-' then take input from stdin.
491 Normally the output is written to stdout. Use the -o option
495 The default separator character is , (comma). To change this
496 on input or output see the -t and -u options respectively.
498 Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated
504 let input_sep = ref ',' in
505 let set_input_sep = function
506 | "TAB" -> input_sep := '\t'
507 | "COMMA" -> input_sep := ','
508 | "SPACE" -> input_sep := ' '
509 | s -> input_sep := s.[0]
512 let output_sep = ref ',' in
513 let set_output_sep = function
514 | "TAB" -> output_sep := '\t'
515 | "COMMA" -> output_sep := ','
516 | "SPACE" -> output_sep := ' '
517 | s -> output_sep := s.[0]
520 let count_zero = ref false in
522 let output_file = ref "" in
530 "-t", Arg.String set_input_sep,
531 "Input separator char. Use -t TAB for tab separated input.";
532 "-u", Arg.String set_output_sep,
533 "Output separator char. Use -u TAB for tab separated output.";
534 "-o", Arg.Set_string output_file,
535 "Write output to file (instead of stdout)";
536 "-z", Arg.Set count_zero,
537 "Number columns from 0 instead of 1";
538 "-", Arg.Unit (fun () -> set_rest "-"),
539 "" (* Hack to allow '-' for input from stdin. *)
542 Arg.parse argspec set_rest usage;
544 let input_sep = !input_sep in
545 let output_sep = !output_sep in
546 let count_zero = !count_zero in
547 let output_file = !output_file in
548 let rest = List.rev !rest in
550 (* Set up the output file. *)
552 if output_file <> "" then open_out output_file
556 | ("col"|"cols") :: colspec :: files ->
557 let colspec = parse_colspec ~count_zero colspec in
558 cmd_cols ~input_sep ~output_sep ~chan colspec files
559 | ("namedcol"|"namedcols") :: names :: files ->
560 let names = nsplit names "," in
561 cmd_namedcols ~input_sep ~output_sep ~chan names files
562 | ("width"|"columns") :: files ->
563 cmd_width ~input_sep ~chan files
564 | ("height"|"rows") :: files ->
565 cmd_height ~input_sep ~chan files
566 | "readable" :: files ->
567 cmd_readable ~input_sep ~chan files
568 | ("cat"|"concat") :: files ->
569 cmd_cat ~input_sep ~output_sep ~chan files
570 | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) ->
571 let colspec1 = parse_colspec ~count_zero colspec1 in
572 let colspec2 = parse_colspec ~count_zero colspec2 in
573 cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files
574 | "square" :: files ->
575 cmd_square ~input_sep ~output_sep ~chan files
576 | "sub" :: r :: c :: rows :: cols :: files ->
577 let r = int_of_string r in
578 let r = if not count_zero then r-1 else r in
579 let c = int_of_string c in
580 let c = if not count_zero then c-1 else c in
581 let rows = int_of_string rows in
582 let cols = int_of_string cols in
583 cmd_sub ~input_sep ~output_sep ~chan r c rows cols files
584 | "replace" :: colspec :: update :: files ->
585 let colspec = parse_colspec ~count_zero colspec in
586 cmd_replace ~input_sep ~output_sep ~chan colspec update files
587 | ("setcolumns"|"set_columns"|"set-columns"|
588 "setcols"|"set_cols"|"set-cols") :: cols :: files ->
589 let cols = int_of_string cols in
590 cmd_set_columns ~input_sep ~output_sep ~chan cols files
591 | ("setrows"|"set_rows"|"set-rows") :: rows :: files ->
592 let rows = int_of_string rows in
593 cmd_set_rows ~input_sep ~output_sep ~chan rows files
594 | ("head"|"take") :: rows :: files ->
595 let rows = int_of_string rows in
596 cmd_head ~input_sep ~output_sep ~chan rows files
597 | "drop" :: rows :: files ->
598 let rows = int_of_string rows in
599 cmd_drop ~input_sep ~output_sep ~chan rows files
601 prerr_endline (Sys.executable_name ^ " --help for usage");
605 if output_file <> "" then close_out chan