1 (* csv.ml - comma separated values parser
3 * $Id: csv.ml,v 1.2 2004-12-06 17:40:50 rich Exp $
6 (* The format of CSV files:
8 * Each field starts with either a double quote char or some other
9 * char. In the latter case things are simple: just read up
10 * to the next comma (,) which marks the end of the field.
12 * In the case where a field begins with a double quote char the
13 * parsing rules are different. Any double quotes are doubled ("") and
14 * we finish reading when we reach an undoubled quote. eg: "The
15 * following is a quote: "", and that's all" is the CSV equivalent of
16 * the following literal field: The following is a quote: ", and that's all.
19 * "0 is the quoted form of ASCII NUL.
21 * CSV fields can also contain literal newline characters, if
22 * they are quoted, eg: "This field
23 * is split over lines" represents a
24 * single field containing a \n.
26 * Excel will only use the quoting format if a field contains a double
27 * quote or comma, although there's no reason why Excel couldn't always
28 * use the quoted format.
30 * The practical upshot of this is that you can't split a line in a CSV
31 * file just by looking at the commas. You need to parse each field
34 * How we represent CSV files:
36 * We load in the whole CSV file at once, and store it internally as a
37 * 'string list list' type (note that each line in the CSV file can,
38 * and often will, have different lengths). We then provide simple
39 * functions to read the CSV file line-by-line, copy it out, or copy a
40 * subset of it into a matrix.
(* A loaded CSV file: a list of rows, each row being the list of its
 * field strings.  Rows are not required to have the same length.
 *)
43 type t = string list list
(* Raised by the parser on malformed input.  The string argument is a
 * human-readable description of the problem.
 *)
45 exception Bad_CSV_file of string
(* [dropwhile f xs] walks [xs] from the front: while the head element
 * satisfies [f] it is discarded and the function recurses on the tail.
 *
 * NOTE(review): only the discarding arm is visible in this excerpt;
 * presumably the other arms return the remaining list unchanged once [f]
 * fails or the list is exhausted -- confirm.
 *)
47 let rec dropwhile f = function
49 | x :: xs when f x -> dropwhile f xs
(* [lines csv] is the number of rows held in [csv]. *)
let lines csv = List.length csv
55 List.fold_left max 0 (List.map List.length csv)
(* States of the character-level parser in [load_rows].
 *
 *  - [StartField]: at the beginning of a field, expecting a quote or
 *    some other char.
 *  - [InQuotedFieldAfterQuote]: inside a quoted field, just after a
 *    double quote; the next char tells whether that quote was the first
 *    half of a doubled quote, the start of the NUL escape, or the end of
 *    the field.
 *
 * [load_rows] also uses the states [InUnquotedField] and [InQuotedField].
 *)
57 type state_t = StartField
60 | InQuotedFieldAfterQuote
(* [load_rows f chan] reads CSV text from the input channel [chan] one
 * character at a time and parses it with a small state machine (see
 * [state_t]).
 *
 * The field being read is accumulated in [field] and the fields of the
 * row being read in [row]; both are built by consing onto the front and
 * are reversed with [List.rev] when they are finished.
 *
 * NOTE(review): the use of [f] is not visible in this excerpt; presumably
 * it is called with each completed row -- confirm.
 *
 * Raises [Bad_CSV_file] with the message given in the final [raise]
 * below; presumably when the input ends inside a quoted field -- confirm.
 *)
62 let load_rows f chan =
63 let row = ref [] in (* Current row, most recent field first. *)
64 let field = ref [] in (* Current field, most recent char first. *)
65 let state = ref StartField in (* Current state. *)
(* Finishing a field: put the accumulated chars back in reading order,
 * allocate a string of the same length and push it onto the current row.
 * NOTE(review): the body of [loop] is not visible here; presumably it
 * copies the chars into [field_str] -- confirm.
 * NOTE(review): String.create is deprecated in favour of Bytes.create in
 * later OCaml releases; check the compiler version this file targets.
 *)
67 let field_list = List.rev !field in
68 let field_len = List.length field_list in
69 let field_str = String.create field_len in
70 let rec loop i = function
77 row := field_str :: !row;
(* Finishing a row: put the accumulated fields back in reading order. *)
87 let row_list = List.rev !row in
(* Main step: fetch the next character and dispatch on the current state.
 * NOTE(review): != is physical inequality; it behaves like <> here
 * because chars are immediate values, but <> is the conventional spelling.
 *)
93 let c = input_char chan in
94 if c != '\r' then ( (* Always ignore \r characters. *)
96 StartField -> (* Expecting quote or other char. *)
98 state := InQuotedField;
100 ) else if c = ',' then (* Empty field. *)
102 else if c = '\n' then ( (* Empty field, end of row. *)
106 state := InUnquotedField;
109 | InUnquotedField -> (* Reading chars to end of field. *)
110 if c = ',' then (* End of field. *)
112 else if c = '\n' then ( (* End of field and end of row. *)
117 | InQuotedField -> (* Reading chars to end of field. *)
119 state := InQuotedFieldAfterQuote
(* A quote was just seen inside a quoted field.  A second quote is a
 * literal quote char, a 0 encodes ASCII NUL (both return to
 * [InQuotedField]); a comma or a newline closes the field.
 *)
122 | InQuotedFieldAfterQuote ->
123 if c = '\"' then ( (* Doubled quote. *)
124 field := c :: !field;
125 state := InQuotedField
126 ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *)
127 field := '\000' :: !field;
128 state := InQuotedField
129 ) else if c = ',' then (* End of field. *)
131 else if c = '\n' then ( (* End of field and end of row. *)
135 ); (* end of match *)
(* Final flush: what still has to be emitted depends on the state the
 * parser stopped in.
 * NOTE(review): presumably this runs once the channel is exhausted; the
 * enclosing handler is not visible in this excerpt -- confirm.
 *)
142 (* Any part left to write out? *)
146 ( empty_field (); end_of_row () )
147 | InUnquotedField | InQuotedFieldAfterQuote ->
148 end_of_field (); end_of_row ()
150 raise (Bad_CSV_file "Missing end quote after quoted field.")
162 let chan = open_in filename in
163 let csv = load_in chan in
(* [trim ?top ?left ?right ?bottom csv] removes empty cells from the
 * edges of [csv].  Each optional flag defaults to [true].
 *
 *  - [top]: drop the leading rows accepted by [empty_row].
 *  - [left]: run [loop], which removes the leftmost column when
 *    [empty_left_col] holds (the header of [loop] is not shown here).
 *  - [right], [bottom]: NOTE(review): the tests on these two flags are
 *    not visible in this excerpt; presumably they guard the trailing-cell
 *    and trailing-row passes below -- confirm.
 *)
167 let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv =
(* A row is rejected as soon as one of its cells is a non-empty string;
 * otherwise the scan continues with the rest of the row (the arm for the
 * empty list is not shown here).
 *)
168 let rec empty_row = function
170 | x :: xs when x <> "" -> false
171 | x :: xs -> empty_row xs
173 let csv = if top then dropwhile empty_row csv else csv in
(* Strip the trailing empty cells of one row: reverse it, drop the
 * leading empty strings, reverse it back.
 *)
177 let row = List.rev row in
178 let row = dropwhile ((=) "") row in
179 let row = List.rev row in
(* Strip the trailing empty rows with the same reverse/drop/reverse
 * scheme, this time over the list of rows.
 *)
184 let csv = List.rev csv in
185 let csv = dropwhile empty_row csv in
186 let csv = List.rev csv in
(* True for a row with no cells at all or whose first cell is the empty
 * string.
 *)
190 let empty_left_cell =
191 function [] -> true | x :: xs when x = "" -> true | _ -> false in
(* Conjunction of [empty_left_cell] over all rows. *)
193 List.fold_left (fun a row -> a && empty_left_cell row) true in
(* Drop the first cell of every row; rows without cells stay as they are. *)
194 let remove_left_col =
195 List.map (function [] -> [] | x :: xs -> xs) in
197 if empty_left_col csv then (
198 let csv = remove_left_col csv in
203 let csv = if left then loop csv else csv in
207 (* Quote a single CSV field.
 *
 * The test below checks whether [field] contains a comma, a double quote
 * or a newline.  The quoting code builds its result in a [Buffer]: an
 * opening double quote, the characters of [field] with every double quote
 * written twice, and a closing double quote.
 *
 * NOTE(review): the [then] and [else] lines are not visible in this
 * excerpt; presumably a field that needs no quoting is returned
 * unchanged -- confirm.
 *)
208 let quote_field field =
209 if String.contains field ',' ||
210 String.contains field '\"' ||
211 String.contains field '\n'
(* 100 is only the initial size of the buffer; it grows as needed. *)
213 let buffer = Buffer.create 100 in
214 Buffer.add_char buffer '\"';
215 for i = 0 to (String.length field) - 1 do
(* An embedded double quote is emitted doubled; any other char as is. *)
217 '\"' -> Buffer.add_string buffer "\"\""
218 | c -> Buffer.add_char buffer c
220 Buffer.add_char buffer '\"';
221 Buffer.contents buffer
(* [save_out chan csv] writes [csv] to the output channel [chan], one row
 * per line: every field is passed through [quote_field], the quoted
 * fields are joined with commas and the row is ended with a newline char.
 *)
let save_out chan csv =
  let write_row fields =
    let quoted = List.map quote_field fields in
    output_string chan (String.concat "," quoted);
    output_char chan '\n'
  in
  List.iter write_row csv
236 let chan = open_out file in