--- /dev/null
+(* csv.ml - comma separated values parser
+ *
+ * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $
+ *)
+
+(* The format of CSV files:
+ *
+ * Each field starts with either a double quote char or some other
+ * char. For the some other char case things are simple: just read up
+ * to the next comma (,) which marks the end of the field.
+ *
+ * In the case where a field begins with a double quote char the
+ * parsing rules are different. Any double quotes are doubled ("") and
+ * we finish reading when we reach an undoubled quote. eg: "The
+ * following is a quote: "", and that's all" is the CSV equivalent of
+ * the following literal field: The following is a quote: ", and that's
+ * all
+ *
+ * CSV fields can also contain literal carriage return characters, if
+ * they are quoted, eg: "This field is split over lines" represents a
+ * single field containing a \n.
+ *
+ * Excel will only use the quoting format if a field contains a double
+ * quote or comma, although there's no reason why Excel couldn't always
+ * use the quoted format.
+ *
+ * The practical upshot of this is that you can't split a line in a CSV
+ * file just by looking at the commas. You need to parse each field
+ * separately.
+ *
+ * How we represent CSV files:
+ *
+ * We load in the whole CSV file at once, and store it internally as a
+ * 'string list list' type (note that each line in the CSV file can,
+ * and often will, have different lengths). We then provide simple
+ * functions to read the CSV file line-by-line, copy it out, or copy a
+ * subset of it into a matrix.
+ *
+ * For future work: According to the Text::CSV_XS manual page, "0 is a
+ * valid encoding, within quoted fields, of the ASCII NUL character. In
+ * Unix this character could, of course, be encoded directly in the
+ * file.
+ *)
+
+type t = string list list
+
+exception Bad_CSV_file of string
+
+let lines = List.length
+
+let columns csv =
+ List.fold_left max 0 (List.map List.length csv)
+
+type state_t = StartField
+ | InUnquotedField
+ | InQuotedField
+ | InQuotedFieldAfterQuote
+
+let load_rows f chan =
+ let row = ref [] in (* Current row. *)
+ let field = ref [] in (* Current field. *)
+ let state = ref StartField in (* Current state. *)
+ let end_of_field () =
+ let field_list = List.rev !field in
+ let field_len = List.length field_list in
+ let field_str = String.create field_len in
+ let rec loop i = function
+ [] -> ()
+ | x :: xs ->
+ field_str.[i] <- x;
+ loop (i+1) xs
+ in
+ loop 0 field_list;
+ row := field_str :: !row;
+ field := [];
+ state := StartField
+ in
+ let empty_field () =
+ row := "" :: !row;
+ field := [];
+ state := StartField
+ in
+ let end_of_row () =
+ let row_list = List.rev !row in
+ f row_list;
+ row := [];
+ state := StartField
+ in
+ let rec loop () =
+ let c = input_char chan in
+ if c != '\r' then ( (* Always ignore \r characters. *)
+ match !state with
+ StartField -> (* Expecting quote or other char. *)
+ if c = '\"' then (
+ state := InQuotedField;
+ field := []
+ ) else if c = ',' then (* Empty field. *)
+ empty_field ()
+ else if c = '\n' then ( (* Empty field, end of row. *)
+ empty_field ();
+ end_of_row ()
+ ) else (
+ state := InUnquotedField;
+ field := [c]
+ )
+ | InUnquotedField -> (* Reading chars to end of field. *)
+ if c = ',' then (* End of field. *)
+ end_of_field ()
+ else if c = '\n' then ( (* End of field and end of row. *)
+ end_of_field ();
+ end_of_row ()
+ ) else
+ field := c :: !field
+ | InQuotedField -> (* Reading chars to end of field. *)
+ if c = '\"' then
+ state := InQuotedFieldAfterQuote
+ else
+ field := c :: !field
+ | InQuotedFieldAfterQuote ->
+ if c = '\"' then ( (* Doubled quote. *)
+ field := c :: !field;
+ state := InQuotedField
+ ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *)
+ field := '\000' :: !field;
+ state := InQuotedField
+ ) else if c = ',' then (* End of field. *)
+ end_of_field ()
+ else if c = '\n' then ( (* End of field and end of row. *)
+ end_of_field ();
+ end_of_row ()
+ )
+ ); (* end of match *)
+ loop ()
+ in
+ try
+ loop ()
+ with
+ End_of_file ->
+ (* Any part left to write out? *)
+ (match !state with
+ StartField ->
+ if !row <> [] then
+ ( empty_field (); end_of_row () )
+ | InUnquotedField | InQuotedFieldAfterQuote ->
+ end_of_field (); end_of_row ()
+ | InQuotedField ->
+ raise (Bad_CSV_file "Missing end quote after quoted field.")
+ )
+
+let load_in chan =
+ let csv = ref [] in
+ let f row =
+ csv := row :: !csv
+ in
+ load_rows f chan;
+ List.rev !csv
+
+let load filename =
+ let chan = open_in filename in
+ let csv = load_in chan in
+ close_in chan;
+ csv
+
+(* Quote a single CSV field. *)
+let quote_field field =
+ if String.contains field ',' ||
+ String.contains field '\"' ||
+ String.contains field '\n'
+ then (
+ let buffer = Buffer.create 100 in
+ Buffer.add_char buffer '\"';
+ for i = 0 to (String.length field) - 1 do
+ match field.[i] with
+ '\"' -> Buffer.add_string buffer "\"\""
+ | c -> Buffer.add_char buffer c
+ done;
+ Buffer.add_char buffer '\"';
+ Buffer.contents buffer
+ )
+ else
+ field
+
+let save_out chan csv =
+ List.iter (fun line ->
+ output_string chan (String.concat ","
+ (List.map quote_field line));
+ output_char chan '\n') csv
+
+let print csv =
+ save_out stdout csv
+
+let save file csv =
+ let chan = open_out file in
+ save_out chan csv;
+ close_out chan
--- /dev/null
+(** csv.mli - comma separated values parser
+ *
+ * $Id: csv.mli,v 1.1 2003-12-17 16:05:08 rich Exp $
+ *)
+
+type t = string list list
+(** Representation of CSV files. *)
+
+exception Bad_CSV_file of string
+(** Badly formed CSV files throw this exception: *)
+
+val lines : t -> int
+(** Work out the number of lines in a CSV file. *)
+
+val columns : t -> int
+(** Work out the (maximum) number of columns in a CSV file. Note that each
+ line may be a different length, so this finds the one with the most
+ columns. *)
+
+val load_in : in_channel -> t
+(** Load a CSV file.
+ * @param chan Input file stream
+ *)
+
+val load : string -> t
+(** Load a CSV file.
+ * @param filename CSV filename.
+ *)
+
+val load_rows : (string list -> unit) -> in_channel -> unit
+(** For very large CSV files which cannot be processed in memory at once,
+ * this function is appropriate. It parses the input one row at a time and
+ * calls your function once for each row.
+ *
+ * @param f Callout function.
+ * @param chan Input file stream.
+ *)
+
+val print : t -> unit
+(** Print string list list - same as [save_out stdout] *)
+
+val save_out : out_channel -> t -> unit
+(** Save string list list to a channel. *)
+
+val save : string -> t -> unit
+(** Save string list list to a file. *)
--- /dev/null
+(* $Id: test.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ *)
+
+open Printf
+open Csv
+
+let do_testcsv filename expected =
+ let csv = load filename in
+ if csv <> expected then (
+ printf "input file: %s\n" filename;
+ printf "Csv library produced:\n";
+ print csv;
+ printf "Expected:\n";
+ print expected;
+ failwith "failed"
+ )
+
+let testcsv1 =
+ do_testcsv
+ "testcsv1.csv"
+ [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns." ] ]
+let testcsv2 =
+ do_testcsv
+ "testcsv2.csv"
+ [ [ "Normal field"; "Quoted field"; "Quoted field with \"\" quotes" ] ]
+let testcsv3 =
+ do_testcsv
+ "testcsv3.csv"
+ [ [ "" ];
+ [ ""; "" ];
+ [ ""; ""; "" ];
+ [ ""; ""; ""; "" ];
+ [ ""; ""; ""; ""; "" ] ]
+let testcsv4 =
+ do_testcsv
+ "testcsv4.csv"
+ []
+let testcsv5 =
+ do_testcsv
+ "testcsv5.csv"
+ [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns.";
+ "a second field"; "a third field" ];
+ [ "a fourth field on a new line" ] ]
+let testcsv6 =
+ do_testcsv
+ "testcsv6.csv"
+ [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns\nand \000";
+ "a second field"; "a third field" ];
+ [ "a fourth field on a new line" ] ]
+
+;;
+
+print_endline "All tests succeeded."