1 (* csv.ml - comma separated values parser
3 * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $
6 (* The format of CSV files:
8 * Each field starts with either a double quote char or some other
9 * char. For the some other char case things are simple: just read up
10 * to the next comma (,) which marks the end of the field.
12 * In the case where a field begins with a double quote char the
13 * parsing rules are different. Any double quotes are doubled ("") and
14 * we finish reading when we reach an undoubled quote. eg: "The
15 * following is a quote: "", and that's all" is the CSV equivalent of
16 * the following literal field: The following is a quote: ", and that's all
19 * CSV fields can also contain literal newline characters, if
20 * they are quoted, eg: "This field is split over lines" represents a
21 * single field containing a \n.
23 * Excel will only use the quoting format if a field contains a double
24 * quote or comma, although there's no reason why Excel couldn't always
25 * use the quoted format.
27 * The practical upshot of this is that you can't split a line in a CSV
28 * file just by looking at the commas. You need to parse each field
31 * How we represent CSV files:
33 * We load in the whole CSV file at once, and store it internally as a
34 * 'string list list' type (note that each line in the CSV file can,
35 * and often will, have different lengths). We then provide simple
36 * functions to read the CSV file line-by-line, copy it out, or copy a
37 * subset of it into a matrix.
39 * For future work: According to the Text::CSV_XS manual page, "0 is a
40 * valid encoding, within quoted fields, of the ASCII NUL character. In
41 * Unix this character could, of course, be encoded directly in the
(* An in-memory CSV file: a list of rows, each row being a list of
   field strings.  Rows are not required to have the same length. *)
type t = string list list

(* Raised for malformed input; the payload is a human-readable reason. *)
exception Bad_CSV_file of string

(* Number of rows in the CSV data. *)
let lines csv = List.length csv
52 List.fold_left max 0 (List.map List.length csv)
(* States of the character-level parser in load_rows.
   NOTE(review): only two constructors are visible here; load_rows also
   matches on InUnquotedField and InQuotedField, so those are presumably
   declared on lines missing from this listing -- confirm. *)
54 type state_t = StartField
57 | InQuotedFieldAfterQuote
(* [load_rows f chan] parses CSV text read one character at a time from
   [chan] with input_char, driving a state machine over state_t.
   Characters of the field being read are consed onto [field] and
   finished fields onto [row], so both are held in reverse order.
   Raises Bad_CSV_file with a missing-end-quote message, presumably when
   the input ends inside a quoted field.
   NOTE(review): many lines of this function are not visible here (the
   definitions of empty_field, end_of_field and end_of_row, the use of
   [f], and the end-of-input handling).  [f] is presumably applied to
   each completed row -- confirm against the full file. *)
59 let load_rows f chan =
60 let row = ref [] in (* Current row. *)
61 let field = ref [] in (* Current field. *)
62 let state = ref StartField in (* Current state. *)
(* Presumably the body of end_of_field: the characters were accumulated
   in reverse, so they are flipped before being copied into a string. *)
64 let field_list = List.rev !field in
65 let field_len = List.length field_list in
(* NOTE(review): String.create is obsolete (strings are immutable in
   current OCaml); Bytes.create is the replacement. *)
66 let field_str = String.create field_len in
67 let rec loop i = function
74 row := field_str :: !row;
(* Fields were pushed in reverse as well; restore left-to-right order. *)
84 let row_list = List.rev !row in
90 let c = input_char chan in
(* The test below uses physical inequality; for chars, which are
   immediate values, it gives the same answer as structural <>. *)
91 if c != '\r' then ( (* Always ignore \r characters. *)
93 StartField -> (* Expecting quote or other char. *)
95 state := InQuotedField;
97 ) else if c = ',' then (* Empty field. *)
99 else if c = '\n' then ( (* Empty field, end of row. *)
103 state := InUnquotedField;
106 | InUnquotedField -> (* Reading chars to end of field. *)
107 if c = ',' then (* End of field. *)
109 else if c = '\n' then ( (* End of field and end of row. *)
114 | InQuotedField -> (* Reading chars to end of field. *)
116 state := InQuotedFieldAfterQuote
(* A quote was just seen inside a quoted field: the next character
   decides whether it was an escape or the end of the field. *)
119 | InQuotedFieldAfterQuote ->
120 if c = '\"' then ( (* Doubled quote. *)
121 field := c :: !field;
122 state := InQuotedField
123 ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *)
124 field := '\000' :: !field;
125 state := InQuotedField
126 ) else if c = ',' then (* End of field. *)
128 else if c = '\n' then ( (* End of field and end of row. *)
132 ); (* end of match *)
(* Presumably reached once the input is exhausted: flush whatever
   field and row are still pending, based on the final state. *)
139 (* Any part left to write out? *)
143 ( empty_field (); end_of_row () )
144 | InUnquotedField | InQuotedFieldAfterQuote ->
145 end_of_field (); end_of_row ()
147 raise (Bad_CSV_file "Missing end quote after quoted field.")
159 let chan = open_in filename in
160 let csv = load_in chan in
(* Quote a single CSV field.

   A field containing a comma, a double quote or a newline is wrapped
   in double quotes, with every embedded double quote doubled.  Any
   other field is returned unchanged.

   NOTE(review): the lines carrying the then/else structure, the match
   header and the loop terminator were missing from the listing.  The
   unquoted branch (return the field as-is) is reconstructed from the
   condition above it -- confirm against the full file. *)
let quote_field field =
  let needs_quoting =
    String.contains field ','
    || String.contains field '\"'
    || String.contains field '\n'
  in
  if not needs_quoting then field
  else begin
    (* Room for the field plus the two enclosing quotes; the buffer
       grows on demand when embedded quotes have to be doubled. *)
    let buffer = Buffer.create (String.length field + 2) in
    Buffer.add_char buffer '\"';
    String.iter
      (function
        | '\"' -> Buffer.add_string buffer "\"\""
        | c -> Buffer.add_char buffer c)
      field;
    Buffer.add_char buffer '\"';
    Buffer.contents buffer
  end
(* Write [csv] to the output channel [chan], one row per line: each
   field goes through [quote_field], the fields of a row are joined
   with commas, and the row is terminated by a newline. *)
let save_out chan csv =
  let write_row row =
    let quoted = List.map quote_field row in
    output_string chan (String.concat "," quoted);
    output_char chan '\n'
  in
  List.iter write_row csv
193 let chan = open_out file in