1 (* csv.ml - comma separated values parser
3 * $Id: csv.ml,v 1.7 2005-11-25 14:06:58 rich Exp $
6 (* The format of CSV files:
8 * Each field starts with either a double quote char or some other
9 * char. For the some other char case things are simple: just read up
10 * to the next comma (,) which marks the end of the field.
12 * In the case where a field begins with a double quote char the
13 * parsing rules are different. Any double quotes are doubled ("") and
14 * we finish reading when we reach an undoubled quote. eg: "The
15 * following is a quote: "", and that's all" is the CSV equivalent of
16 * the following literal field: The following is a quote: ", and that's
19 * "0 is the quoted form of ASCII NUL.
21 * CSV fields can also contain literal carriage return characters, if
22 * they are quoted, eg: "This field
23 * is split over lines" represents a
24 * single field containing a \n.
26 * Excel will only use the quoting format if a field contains a double
27 * quote or comma, although there's no reason why Excel couldn't always
28 * use the quoted format.
30 * The practical upshot of this is that you can't split a line in a CSV
31 * file just by looking at the commas. You need to parse each field
34 * How we represent CSV files:
36 * We load in the whole CSV file at once, and store it internally as a
37 * 'string list list' type (note that each line in the CSV file can,
38 * and often will, have different lengths). We then provide simple
39 * functions to read the CSV file line-by-line, copy it out, or copy a
40 * subset of it into a matrix.
43 type t = string list list
45 exception Bad_CSV_file of string
47 let rec dropwhile f = function
49 | x :: xs when f x -> dropwhile f xs
53 let rec drop n = function
54 | _ :: l when n > 0 -> drop (n-1) l
57 let rec take n = function
58 | x :: xs when n > 0 -> x :: take (pred n) xs
61 let lines = List.length
64 List.fold_left max 0 (List.map List.length csv)
66 type state_t = StartField
69 | InQuotedFieldAfterQuote
71 let load_rows ?(separator = ',') f chan =
72 let row = ref [] in (* Current row. *)
73 let field = ref [] in (* Current field. *)
74 let state = ref StartField in (* Current state. *)
76 let field_list = List.rev !field in
77 let field_len = List.length field_list in
78 let field_str = String.create field_len in
79 let rec loop i = function
86 row := field_str :: !row;
96 let row_list = List.rev !row in
102 let c = input_char chan in
103 if c != '\r' then ( (* Always ignore \r characters. *)
105 StartField -> (* Expecting quote or other char. *)
107 state := InQuotedField;
109 ) else if c = separator then (* Empty field. *)
111 else if c = '\n' then ( (* Empty field, end of row. *)
115 state := InUnquotedField;
118 | InUnquotedField -> (* Reading chars to end of field. *)
119 if c = separator then (* End of field. *)
121 else if c = '\n' then ( (* End of field and end of row. *)
126 | InQuotedField -> (* Reading chars to end of field. *)
128 state := InQuotedFieldAfterQuote
131 | InQuotedFieldAfterQuote ->
132 if c = '"' then ( (* Doubled quote. *)
133 field := c :: !field;
134 state := InQuotedField
135 ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *)
136 field := '\000' :: !field;
137 state := InQuotedField
138 ) else if c = separator then (* End of field. *)
140 else if c = '\n' then ( (* End of field and end of row. *)
143 ) else ( (* Bad single quote in field. *)
144 field := c :: '"' :: !field;
145 state := InQuotedField
147 ); (* end of match *)
154 (* Any part left to write out? *)
158 ( empty_field (); end_of_row () )
159 | InUnquotedField | InQuotedFieldAfterQuote ->
160 end_of_field (); end_of_row ()
162 raise (Bad_CSV_file "Missing end quote after quoted field.")
165 let load_in ?separator chan =
170 load_rows ?separator f chan;
173 let load ?separator filename =
174 let chan = open_in filename in
175 let csv = load_in ?separator chan in
179 let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv =
180 let rec empty_row = function
182 | x :: xs when x <> "" -> false
183 | x :: xs -> empty_row xs
185 let csv = if top then dropwhile empty_row csv else csv in
189 let row = List.rev row in
190 let row = dropwhile ((=) "") row in
191 let row = List.rev row in
196 let csv = List.rev csv in
197 let csv = dropwhile empty_row csv in
198 let csv = List.rev csv in
202 let empty_left_cell =
203 function [] -> true | x :: xs when x = "" -> true | _ -> false in
205 List.fold_left (fun a row -> a && empty_left_cell row) true in
206 let remove_left_col =
207 List.map (function [] -> [] | x :: xs -> xs) in
209 if empty_left_col csv then (
210 let csv = remove_left_col csv in
215 let csv = if left then loop csv else csv in
220 let columns = columns csv in
223 let n = List.length row in
224 let row = List.rev row in
225 let rec loop acc = function
227 | i -> "" :: loop acc (i-1)
229 let row = loop row (columns - n) in
234 let columns = columns csv in
235 List.for_all (fun row -> List.length row = columns) csv
237 let rec set_columns cols = function
240 let rec loop i cells =
243 | [] -> "" :: loop (succ i) []
244 | c :: cs -> c :: loop (succ i) cs
248 loop 0 r :: set_columns cols rs
250 let rec set_rows rows csv =
253 | [] -> [] :: set_rows (pred rows) []
254 | r :: rs -> r :: set_rows (pred rows) rs
258 let set_size rows cols csv =
259 set_columns cols (set_rows rows csv)
261 let sub r c rows cols csv =
262 let csv = drop r csv in
263 let csv = List.map (drop c) csv in
264 let csv = set_rows rows csv in
265 let csv = set_columns cols csv in
269 Array.of_list (List.map Array.of_list csv)
272 List.map Array.to_list (Array.to_list csv)
274 let associate header data =
275 let nr_cols = List.length header in
276 let rec trunc = function
278 | n, [] -> "" :: trunc (n-1, [])
279 | n, (x :: xs) -> x :: trunc (n-1, xs)
283 let row = trunc (nr_cols, row) in
284 List.combine header row
287 let save_out ?(separator = ',') chan csv =
288 (* Quote a single CSV field. *)
289 let quote_field field =
290 if String.contains field separator ||
291 String.contains field '\"' ||
292 String.contains field '\n'
294 let buffer = Buffer.create 100 in
295 Buffer.add_char buffer '\"';
296 for i = 0 to (String.length field) - 1 do
298 '\"' -> Buffer.add_string buffer "\"\""
299 | c -> Buffer.add_char buffer c
301 Buffer.add_char buffer '\"';
302 Buffer.contents buffer
308 let separator = String.make 1 separator in
309 List.iter (fun line ->
310 output_string chan (String.concat separator
311 (List.map quote_field line));
312 output_char chan '\n') csv
314 let print ?separator csv =
315 save_out ?separator stdout csv; flush stdout
317 let save ?separator file csv =
318 let chan = open_out file in
319 save_out ?separator chan csv;
322 let save_out_readable chan csv =
323 (* Escape all the strings in the CSV file first. *)
324 let csv = List.map (List.map String.escaped) csv in
326 let csv = square csv in
328 (* Find the width of each column. *)
333 let n = List.length r in
334 let lengths = List.map (List.map String.length) csv in
336 let rp = List.combine r1 r2 in
337 List.map (fun ((a : int), (b : int)) -> max a b) rp
339 let rec repeat x = function
341 | i -> x :: repeat x (i-1)
343 List.fold_left max2rows (repeat 0 n) lengths in
345 (* Print out each cell at the correct width. *)
346 let rec repeat f = function
348 | i -> f (); repeat f (i-1)
352 let row = List.combine widths row in
355 output_string chan cell;
356 let n = String.length cell in
357 repeat (fun () -> output_char chan ' ') (width - n + 1)
359 output_char chan '\n'
362 let print_readable = save_out_readable stdout