1 (* Handy tool for managing CSV files.
2 * $Id: csvtool.ml,v 1.6 2006-11-24 10:09:56 rich Exp $
8 (*------------------------------ start of code from extlib *)
9 exception Invalid_string
12 let sublen = String.length sub in
17 let len = String.length str in
19 for i = 0 to len - sublen do
21 while String.unsafe_get str (i + !j) = String.unsafe_get sub !j do
23 if !j = sublen then begin found := i; raise Exit; end;
31 let p = find str sep in
32 let len = String.length sep in
33 let slen = String.length str in
34 String.sub str 0 p, String.sub str (p + len) (slen - p - len)
39 let rec nsplit str sep =
41 let s1 , s2 = split str sep in
44 Invalid_string -> [str]
53 external inj : 'a mut_list -> 'a list = "%identity"
55 let dummy_node () = { hd = Obj.magic (); tl = [] }
57 let rec drop n = function
58 | _ :: l when n > 0 -> drop (n-1) l
62 let rec loop n dst = function
63 | h :: t when n > 0 ->
64 let r = { hd = h; tl = [] } in
70 let dummy = dummy_node() in
73 (*------------------------------ end of extlib code *)
75 (* Parse column specs. *)
76 type colspec = range list
79 | Range of int * int (* 2-5 *)
80 | ToEnd of int (* 7- *)
82 let parse_colspec ~count_zero colspec =
83 let cols = nsplit colspec "," in
88 let first, second = split col "-" in
90 Range (int_of_string first, int_of_string second)
92 ToEnd (int_of_string first)
95 Col (int_of_string col)
98 Failure "int_of_string" ->
99 failwith (colspec ^ ":" ^ col ^ ": invalid column-spec")
102 (* Adjust so columns always count from zero. *)
103 if not count_zero then
107 | Range (s, e) -> Range (s-1, e-1)
108 | ToEnd e -> ToEnd (e-1)
113 let rec width_of_colspec = function
115 | Col c :: rest -> 1 + width_of_colspec rest
116 | Range (s, e) :: rest -> (e-s+1) + width_of_colspec rest
118 failwith "width_of_colspec: cannot calculate width of an open column spec (one which contains 'N-')"
120 (* For closed column specs, this preserves the correct width in the
123 let cols_of_colspec colspec row =
124 let rec loop = function
128 with Failure "nth" -> "") :: loop rest
129 | Range (s, e) :: rest ->
131 let range = take width (drop s row) in
132 let range = List.hd (set_columns width [range]) in
133 List.append range (loop rest)
135 List.append (drop e row) (loop rest)
139 (* The actual commands. *)
140 let cmd_cols ~input_sep ~output_sep ~chan colspec files =
143 let csv = load ~separator:input_sep filename in
144 let csv = List.map (cols_of_colspec colspec) csv in
145 save_out ~separator:output_sep chan csv
148 let cmd_namedcols ~input_sep ~output_sep ~chan names files =
151 let csv = load ~separator:input_sep filename in
154 | [] -> failwith "no rows in this CSV file"
156 let data = associate header data in
157 let data = List.map (
158 fun row -> List.map (fun name -> List.assoc name row) names
160 save_out ~separator:output_sep chan data
163 let cmd_width ~input_sep ~chan files =
164 let width = List.fold_left (
165 fun width filename ->
166 let csv = load ~separator:input_sep filename in
167 let width = max width (columns csv) in
170 fprintf chan "%d\n" width
172 let cmd_height ~input_sep ~chan files =
173 let height = List.fold_left (
174 fun height filename ->
175 let csv = load ~separator:input_sep filename in
176 let height = height + lines csv in
179 fprintf chan "%d\n" height
181 let cmd_readable ~input_sep ~chan files =
182 let csv = List.concat (List.map (load ~separator:input_sep) files) in
183 save_out_readable chan csv
185 let cmd_cat ~input_sep ~output_sep ~chan files =
186 (* Avoid loading the whole file into memory. *)
188 save_out ~separator:output_sep chan [row]
192 let in_chan = open_in filename in
193 load_rows ~separator:input_sep f in_chan;
197 let cmd_square ~input_sep ~output_sep ~chan files =
198 let csv = List.concat (List.map (load ~separator:input_sep) files) in
199 let csv = square csv in
200 save_out ~separator:output_sep chan csv
202 let cmd_sub ~input_sep ~output_sep ~chan r c rows cols files =
203 let csv = List.concat (List.map (load ~separator:input_sep) files) in
204 let csv = sub r c rows cols csv in
205 save_out ~separator:output_sep chan csv
207 let cmd_replace ~input_sep ~output_sep ~chan colspec update files =
208 let csv = List.concat (List.map (load ~separator:input_sep) files) in
210 (* Load the update CSV file in. *)
211 let update = Csv.load ~separator:input_sep update in
213 (* Compare two rows for equality by considering only the columns
216 let equal row1 row2 =
217 let row1 = cols_of_colspec colspec row1 in
218 let row2 = cols_of_colspec colspec row2 in
219 0 = Csv.compare [row1] [row2]
222 (* Look for rows in the original to be replaced by rows from the
223 * update file. This is an ugly O(n^2) hack (XXX).
225 let csv = List.filter (
226 fun row -> not (List.exists (equal row) update)
228 let csv = csv @ update in
229 save_out ~separator:output_sep chan csv
231 let rec uniq = function
234 | x :: y :: xs when Pervasives.compare x y = 0 ->
239 let cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files =
240 (* Load in the files separately. *)
241 let csvs = List.map (load ~separator:input_sep) files in
243 (* For each CSV file, construct a hash table from row class (key) to
244 * the (possibly empty) output columns (values).
245 * Also construct a hash which has the unique list of row classes.
247 let keys = Hashtbl.create 1023 in
248 let hashes = List.map (
250 let hash = Hashtbl.create 1023 in
253 let key = cols_of_colspec colspec1 row in
254 let value = cols_of_colspec colspec2 row in
255 if not (Hashtbl.mem keys key) then Hashtbl.add keys key true;
256 Hashtbl.add hash key value
262 let keys = Hashtbl.fold (fun key _ xs -> key :: xs) keys [] in
264 let value_width = width_of_colspec colspec2 in
266 List.hd (set_columns value_width [[""]]) in
267 let multiple_values =
268 List.hd (set_columns value_width [["!MULTIPLE VALUES"]]) in
270 (* Generate output CSV. *)
271 let keys = List.sort Pervasives.compare keys in
272 let keys = List.map (fun key -> key, []) keys in
273 let csv = List.fold_left (
277 let value = try Hashtbl.find_all hash key with Not_found -> [] in
282 | _::_ -> multiple_values in
283 key, (value :: values)
288 key @ List.flatten (List.rev values)
290 save_out ~separator:output_sep chan csv
292 (* Process the arguments. *)
294 "csvtool - Copyright (C) 2005-2006 Richard W.M. Jones, Merjis Ltd.
296 csvtool is a tool for performing manipulations on CSV files from shell scripts.
299 csvtool [-options] command [command-args] input.csv [input2.csv [...]]
303 Return one or more columns from the CSV file.
305 For <column-spec>, see below.
307 Example: csvtool col 1-3,6 input.csv > output.csv
310 Assuming the first row of the CSV file is a list of column headings,
311 this returned the column(s) with the named headings.
313 <names> is a comma-separated list of names.
315 Example: csvtool namedcol Account,Cost input.csv > output.csv
318 Print the maximum width of the CSV file (number of columns in the
322 Print the number of rows in the CSV file.
325 Print the input CSV in a readable format.
328 This concatenates the input files together and writes them to
329 the output. You can use this to change the separator character.
331 Example: csvtool -t TAB -u , cat input.tsv > output.csv
333 join <column-spec1> <column-spec2>
334 Join (collate) multiple CSV files together.
336 <column-spec1> controls which columns are compared.
338 <column-spec2> controls which columns are copied into the new file.
341 csvtool join 1 2 coll1.csv coll2.csv
342 If coll1.csv contains:
345 and coll2.csv contains:
347 then the output will be:
352 Make the CSV square, so all rows have the same length.
355 Take a square subset of the CSV, top left at row r, column c (counting
356 from 0), which is rows deep and cols wide.
358 replace <column-spec> update.csv original.csv
359 Replace rows in original.csv with rows from update.csv. The columns
360 in <column-spec> only are used to compare rows in input.csv and
361 update.csv to see if they are candidates for replacement.
364 csvtool replace 3 updates.csv original.csv > new.csv
365 mv new.csv original.csv
368 A <column-spec> is a comma-separated list of column numbers
372 1 Column 1 (the first, leftmost column)
373 2,5,7 Columns 2, 5 and 7
374 1-3,5 Columns 1, 2, 3 and 5
375 1,5- Columns 1, 5 and up.
377 Columns are numbered starting from 1 unless the -z option is given.
380 csvtool takes a list of input file(s) from the command line.
383 Normally the output is written to stdout. Use the -o option
387 The default separator character is , (comma). To change this
388 on input or output see the -t and -u options respectively.
390 Use -t TAB or -u TAB (literally T-A-B!) to specify tab-separated
396 let input_sep = ref ',' in
397 let set_input_sep = function
398 | "TAB" -> input_sep := '\t'
399 | "COMMA" -> input_sep := ','
400 | "SPACE" -> input_sep := ' '
401 | s -> input_sep := s.[0]
404 let output_sep = ref ',' in
405 let set_output_sep = function
406 | "TAB" -> output_sep := '\t'
407 | "COMMA" -> output_sep := ','
408 | "SPACE" -> output_sep := ' '
409 | s -> output_sep := s.[0]
412 let count_zero = ref false in
414 let output_file = ref "" in
417 "-t", Arg.String set_input_sep,
418 "Input separator char. Use -t TAB for tab separated input.";
419 "-u", Arg.String set_output_sep,
420 "Output separator char. Use -u TAB for tab separated output.";
421 "-o", Arg.Set_string output_file,
422 "Write output to file (instead of stdout)";
423 "-z", Arg.Set count_zero,
424 "Number columns from 0 instead of 1";
432 Arg.parse argspec set_rest usage;
434 let input_sep = !input_sep in
435 let output_sep = !output_sep in
436 let count_zero = !count_zero in
437 let output_file = !output_file in
438 let rest = List.rev !rest in
440 (* Set up the output file. *)
442 if output_file <> "" then open_out output_file
446 | ("col"|"cols") :: colspec :: files ->
447 let colspec = parse_colspec ~count_zero colspec in
448 cmd_cols ~input_sep ~output_sep ~chan colspec files
449 | ("namedcol"|"namedcols") :: names :: files ->
450 let names = nsplit names "," in
451 cmd_namedcols ~input_sep ~output_sep ~chan names files
452 | "width" :: files ->
453 cmd_width ~input_sep ~chan files
454 | "height" :: files ->
455 cmd_height ~input_sep ~chan files
456 | "readable" :: files ->
457 cmd_readable ~input_sep ~chan files
458 | ("cat"|"concat") :: files ->
459 cmd_cat ~input_sep ~output_sep ~chan files
460 | ("join"|"collate") :: colspec1 :: colspec2 :: ((_::_::_) as files) ->
461 let colspec1 = parse_colspec ~count_zero colspec1 in
462 let colspec2 = parse_colspec ~count_zero colspec2 in
463 cmd_join ~input_sep ~output_sep ~chan colspec1 colspec2 files
464 | "square" :: files ->
465 cmd_square ~input_sep ~output_sep ~chan files
466 | "sub" :: r :: c :: rows :: cols :: files ->
467 let r = int_of_string r in
468 let c = int_of_string c in
469 let rows = int_of_string rows in
470 let cols = int_of_string cols in
471 cmd_sub ~input_sep ~output_sep ~chan r c rows cols files
472 | "replace" :: colspec :: update :: files ->
473 let colspec = parse_colspec ~count_zero colspec in
474 cmd_replace ~input_sep ~output_sep ~chan colspec update files
476 prerr_endline (Sys.executable_name ^ " --help for usage");
480 if output_file <> "" then close_out chan