1 (* csv.ml - comma separated values parser
3 * $Id: csv.ml,v 1.15 2008-10-27 21:57:48 rich Exp $
6 (* The format of CSV files:
8 * Each field starts with either a double quote char or some other
9 * char. For the some other char case things are simple: just read up
10 * to the next comma (,) which marks the end of the field.
12 * In the case where a field begins with a double quote char the
13 * parsing rules are different. Any double quotes are doubled ("") and
14 * we finish reading when we reach an undoubled quote. eg: "The
15 * following is a quote: "", and that's all" is the CSV equivalent of
16 * the following literal field: The following is a quote: ", and that's
19 * "0 is the quoted form of ASCII NUL.
21 * CSV fields can also contain literal carriage return characters, if
22 * they are quoted, eg: "This field
23 * is split over lines" represents a
24 * single field containing a \n.
26 * Excel will only use the quoting format if a field contains a double
27 * quote or comma, although there's no reason why Excel couldn't always
28 * use the quoted format.
30 * The practical upshot of this is that you can't split a line in a CSV
31 * file just by looking at the commas. You need to parse each field
34 * How we represent CSV files:
36 * We load in the whole CSV file at once, and store it internally as a
37 * 'string list list' type (note that each line in the CSV file can,
38 * and often will, have different lengths). We then provide simple
39 * functions to read the CSV file line-by-line, copy it out, or copy a
40 * subset of it into a matrix.
45 (* Uncomment the next line to enable Extlib's List function. These
46 * avoid stack overflows on really huge CSV files.
50 type t = string list list
52 exception Bad_CSV_file of string
54 let rec dropwhile f = function
56 | x :: xs when f x -> dropwhile f xs
60 let rec drop n = function
61 | _ :: l when n > 0 -> drop (n-1) l
64 let rec take n = function
65 | x :: xs when n > 0 -> x :: take (pred n) xs
68 let lines = List.length
71 List.fold_left max 0 (List.map List.length csv)
73 type state_t = StartField
76 | InQuotedFieldAfterQuote
78 let load_rows ?(separator = ',') f chan =
79 let row = ref [] in (* Current row. *)
80 let field = ref [] in (* Current field. *)
81 let state = ref StartField in (* Current state. *)
83 let field_list = List.rev !field in
84 let field_len = List.length field_list in
85 let field_str = String.create field_len in
86 let rec loop i = function
93 row := field_str :: !row;
103 let row_list = List.rev !row in
109 let c = input_char chan in
110 if c != '\r' then ( (* Always ignore \r characters. *)
112 StartField -> (* Expecting quote or other char. *)
114 state := InQuotedField;
116 ) else if c = separator then (* Empty field. *)
118 else if c = '\n' then ( (* Empty field, end of row. *)
122 state := InUnquotedField;
125 | InUnquotedField -> (* Reading chars to end of field. *)
126 if c = separator then (* End of field. *)
128 else if c = '\n' then ( (* End of field and end of row. *)
133 | InQuotedField -> (* Reading chars to end of field. *)
135 state := InQuotedFieldAfterQuote
138 | InQuotedFieldAfterQuote ->
139 if c = '"' then ( (* Doubled quote. *)
140 field := c :: !field;
141 state := InQuotedField
142 ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *)
143 field := '\000' :: !field;
144 state := InQuotedField
145 ) else if c = separator then (* End of field. *)
147 else if c = '\n' then ( (* End of field and end of row. *)
150 ) else ( (* Bad single quote in field. *)
151 field := c :: '"' :: !field;
152 state := InQuotedField
154 ); (* end of match *)
161 (* Any part left to write out? *)
165 ( empty_field (); end_of_row () )
166 | InUnquotedField | InQuotedFieldAfterQuote ->
167 end_of_field (); end_of_row ()
169 raise (Bad_CSV_file "Missing end quote after quoted field.")
172 let load_in ?separator chan =
177 load_rows ?separator f chan;
180 let load ?separator filename =
183 | "-" -> stdin, false
184 | filename -> open_in filename, true in
185 let csv = load_in ?separator chan in
186 if close then close_in chan;
189 let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv =
190 let rec empty_row = function
192 | x :: xs when x <> "" -> false
193 | x :: xs -> empty_row xs
195 let csv = if top then dropwhile empty_row csv else csv in
199 let row = List.rev row in
200 let row = dropwhile ((=) "") row in
201 let row = List.rev row in
206 let csv = List.rev csv in
207 let csv = dropwhile empty_row csv in
208 let csv = List.rev csv in
212 let empty_left_cell =
213 function [] -> true | x :: xs when x = "" -> true | _ -> false in
215 List.fold_left (fun a row -> a && empty_left_cell row) true in
216 let remove_left_col =
217 List.map (function [] -> [] | x :: xs -> xs) in
219 if empty_left_col csv then
225 let csv = if left then loop csv else csv in
230 let columns = columns csv in
233 let n = List.length row in
234 let row = List.rev row in
235 let rec loop acc = function
237 | i -> "" :: loop acc (i-1)
239 let row = loop row (columns - n) in
244 let columns = columns csv in
245 List.for_all (fun row -> List.length row = columns) csv
247 let rec set_columns cols = function
250 let rec loop i cells =
253 | [] -> "" :: loop (succ i) []
254 | c :: cs -> c :: loop (succ i) cs
258 loop 0 r :: set_columns cols rs
260 let rec set_rows rows csv =
263 | [] -> [] :: set_rows (pred rows) []
264 | r :: rs -> r :: set_rows (pred rows) rs
268 let set_size rows cols csv =
269 set_columns cols (set_rows rows csv)
271 let sub r c rows cols csv =
272 let csv = drop r csv in
273 let csv = List.map (drop c) csv in
274 let csv = set_rows rows csv in
275 let csv = set_columns cols csv in
278 (* Compare two rows for semantic equality - ignoring any blank cells
279 * at the end of each row.
281 let rec compare_row (row1 : string list) row2 =
282 match row1, row2 with
284 | x :: xs, y :: ys ->
285 let c = compare x y in
286 if c <> 0 then c else compare_row xs ys
296 (* Semantic equality for CSV files. *)
297 let rec compare (csv1 : t) csv2 =
298 match csv1, csv2 with
300 | x :: xs, y :: ys ->
301 let c = compare_row x y in
302 if c <> 0 then c else compare xs ys
304 let c = compare_row x [] in
305 if c <> 0 then c else compare xs []
307 let c = compare_row [] y in
308 if c <> 0 then c else compare [] ys
310 (* Concatenate - arrange left to right. *)
311 let rec concat = function
314 | left_csv :: csvs ->
315 (* Concatenate the remaining CSV files. *)
316 let right_csv = concat csvs in
318 (* Set the height of the left and right CSVs to the same. *)
319 let nr_rows = max (lines left_csv) (lines right_csv) in
320 let left_csv = set_rows nr_rows left_csv in
321 let right_csv = set_rows nr_rows right_csv in
323 (* Square off the left CSV. *)
324 let left_csv = square left_csv in
326 (* Prepend the right CSV rows with the left CSV rows. *)
328 fun (left_row, right_row) -> List.append left_row right_row
329 ) (List.combine left_csv right_csv)
332 Array.of_list (List.map Array.of_list csv)
335 List.map Array.to_list (Array.to_list csv)
337 let associate header data =
338 let nr_cols = List.length header in
339 let rec trunc = function
341 | n, [] -> "" :: trunc (n-1, [])
342 | n, (x :: xs) -> x :: trunc (n-1, xs)
346 let row = trunc (nr_cols, row) in
347 List.combine header row
350 let save_out ?(separator = ',') chan csv =
351 (* Quote a single CSV field. *)
352 let quote_field field =
353 if String.contains field separator ||
354 String.contains field '\"' ||
355 String.contains field '\n'
357 let buffer = Buffer.create 100 in
358 Buffer.add_char buffer '\"';
359 for i = 0 to (String.length field) - 1 do
361 '\"' -> Buffer.add_string buffer "\"\""
362 | c -> Buffer.add_char buffer c
364 Buffer.add_char buffer '\"';
365 Buffer.contents buffer
371 let separator = String.make 1 separator in
372 List.iter (fun line ->
373 output_string chan (String.concat separator
374 (List.map quote_field line));
375 output_char chan '\n') csv
377 let print ?separator csv =
378 save_out ?separator stdout csv; flush stdout
380 let save ?separator file csv =
381 let chan = open_out file in
382 save_out ?separator chan csv;
385 let save_out_readable chan csv =
386 (* Escape all the strings in the CSV file first. *)
387 (* XXX Why are we doing this? I commented it out anyway.
388 let csv = List.map (List.map String.escaped) csv in
391 (* Find the width of each column. *)
393 (* Don't consider rows with only a single element - typically
396 let csv = List.filter (function [_] -> false | _ -> true) csv in
398 (* Square the CSV file - makes the next step simpler to implement. *)
399 let csv = square csv in
404 let lengths_row1 = List.map String.length row1 in
405 let lengths_rest = List.map (List.map String.length) rest in
408 try List.combine r1 r2
410 Invalid_argument "List.combine" ->
411 failwith (sprintf "Csv.save_out_readable: internal error: length r1 = %d, length r2 = %d" (List.length r1) (List.length r2)) in
412 List.map (fun ((a : int), (b : int)) -> max a b) rp
414 List.fold_left max2rows lengths_row1 lengths_rest in
416 (* Print out each cell at the correct width. *)
417 let rec repeat f = function
419 | i -> f (); repeat f (i-1)
423 | [cell] -> (* Single column. *)
424 output_string chan cell;
425 output_char chan '\n'
426 | row -> (* Other. *)
427 (* Pair up each cell with its max width. *)
429 let rec loop = function
431 | (_, []) -> failwith "Csv.save_out_readable: internal error"
432 | (cell :: cells, width :: widths) ->
433 (cell, width) :: loop (cells, widths)
435 loop (row, widths) in
438 output_string chan cell;
439 let n = String.length cell in
440 repeat (fun () -> output_char chan ' ') (width - n + 1)
442 output_char chan '\n'
445 let print_readable = save_out_readable stdout