From 677587e4828c264c36a2ddce644697c9d4b8ac06 Mon Sep 17 00:00:00 2001 From: rich Date: Wed, 17 Dec 2003 16:05:08 +0000 Subject: [PATCH] Added CSV parsing library, and fixed handling of "0 (ASCII NUL). --- .cvsignore | 6 ++ .depend | 4 ++ Makefile | 54 +++++++++++++++++ csv.ml | 195 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ csv.mli | 46 ++++++++++++++ test.ml | 52 ++++++++++++++++ testcsv1.csv | 4 ++ testcsv2.csv | 1 + testcsv3.csv | 5 ++ testcsv4.csv | 0 testcsv5.csv | 5 ++ testcsv6.csv | 6 ++ 12 files changed, 378 insertions(+) create mode 100644 .cvsignore create mode 100644 .depend create mode 100644 Makefile create mode 100644 csv.ml create mode 100644 csv.mli create mode 100644 test.ml create mode 100644 testcsv1.csv create mode 100644 testcsv2.csv create mode 100644 testcsv3.csv create mode 100644 testcsv4.csv create mode 100644 testcsv5.csv create mode 100644 testcsv6.csv diff --git a/.cvsignore b/.cvsignore new file mode 100644 index 0000000..b6131c8 --- /dev/null +++ b/.cvsignore @@ -0,0 +1,6 @@ +*.cma +*.cmi +*.cmo +*.cmx +*.cmxa +test \ No newline at end of file diff --git a/.depend b/.depend new file mode 100644 index 0000000..d77da96 --- /dev/null +++ b/.depend @@ -0,0 +1,4 @@ +csv.cmo: csv.cmi +csv.cmx: csv.cmi +test.cmo: csv.cmi +test.cmx: csv.cmx diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..09f55b8 --- /dev/null +++ b/Makefile @@ -0,0 +1,54 @@ +# $Id: Makefile,v 1.1 2003-12-17 16:05:08 rich Exp $ + +OCAMLC := ocamlc +OCAMLCINCS := +OCAMLCFLAGS := -w s -g +OCAMLCLIBS := + +OCAMLOPT := ocamlopt +OCAMLOPTINCS := $(OCAMLCINCS) +OCAMLOPTFLAGS := -w s +OCAMLOPTLIBS := + +OBJS := csv.cmo +XOBJS := $(OBJS:.cmo=.cmx) + +all: csv.cma csv.cmxa + +csv.cma: $(OBJS) + $(OCAMLC) $(OCAMLCFLAGS) -a -o $@ $^ + +csv.cmxa: $(XOBJS) + $(OCAMLOPT) $(OCAMLOPTFLAGS) -a -o $@ $^ + +test: csv.cma test.ml + $(OCAMLC) csv.cma test.ml -o test + ./test + +# Common rules for building OCaml objects. 
+ +.mli.cmi: + $(OCAMLC) $(OCAMLCFLAGS) $(OCAMLCINCS) -c $< +.ml.cmo: + $(OCAMLC) $(OCAMLCFLAGS) $(OCAMLCINCS) -c $< +.ml.cmx: + $(OCAMLOPT) $(OCAMLOPTFLAGS) $(OCAMLOPTINCS) -c $< + +# Clean. + +clean: + rm -f *.cmi *.cmo *.cmx *.cma *.cmxa *.o *~ core + +# Dependencies. + +depend: .depend + +.depend: $(wildcard *.mli) $(wildcard *.ml) + rm -f .depend + ocamldep $^ > $@ + +ifeq ($(wildcard .depend),.depend) +include .depend +endif + +.SUFFIXES: .cmo .cmi .cmx .ml .mli diff --git a/csv.ml b/csv.ml new file mode 100644 index 0000000..b5c392b --- /dev/null +++ b/csv.ml @@ -0,0 +1,195 @@ +(* csv.ml - comma separated values parser + * + * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ + *) + +(* The format of CSV files: + * + * Each field starts with either a double quote char or some other + * char. For the some other char case things are simple: just read up + * to the next comma (,) which marks the end of the field. + * + * In the case where a field begins with a double quote char the + * parsing rules are different. Any double quotes are doubled ("") and + * we finish reading when we reach an undoubled quote. eg: "The + * following is a quote: "", and that's all" is the CSV equivalent of + * the following literal field: The following is a quote: ", and that's + * all + * + * CSV fields can also contain literal newline characters, if + * they are quoted, eg: "This field is split over lines" represents a + * single field containing a \n. + * + * Excel will only use the quoting format if a field contains a double + * quote or comma, although there's no reason why Excel couldn't always + * use the quoted format. + * + * The practical upshot of this is that you can't split a line in a CSV + * file just by looking at the commas. You need to parse each field + * separately. 
+ * + * How we represent CSV files: + * + * We load in the whole CSV file at once, and store it internally as a + * 'string list list' type (note that each line in the CSV file can, + * and often will, have different lengths). We then provide simple + * functions to read the CSV file line-by-line, copy it out, or copy a + * subset of it into a matrix. + * + * Note: according to the Text::CSV_XS manual page, "0 is a + * valid encoding, within quoted fields, of the ASCII NUL character; + * load_rows decodes it as such. In Unix this character could, of + * course, be encoded directly in the file. + *) + +type t = string list list + +exception Bad_CSV_file of string + +let lines = List.length (* Number of rows. *) + +let columns csv = (* Length of the longest row, or 0 when there are no rows. *) + List.fold_left max 0 (List.map List.length csv) + +type state_t = StartField (* About to read the first char of a field. *) + | InUnquotedField (* Inside a field that did not start with a quote. *) + | InQuotedField (* Inside a field that started with a quote. *) + | InQuotedFieldAfterQuote (* Just read a quote while inside a quoted field. *) + +let load_rows f chan = (* Parse chan one char at a time, calling f on each completed row. *) + let row = ref [] in (* Fields of the current row, newest first. *) + let field = ref [] in (* Chars of the current field, newest first. *) + let state = ref StartField in (* Current parser state. *) + let end_of_field () = (* Append the accumulated chars to the row as a string. *) + let field_list = List.rev !field in + let field_len = List.length field_list in + let field_str = String.create field_len in + let rec loop i = function + [] -> () + | x :: xs -> + field_str.[i] <- x; + loop (i+1) xs + in + loop 0 field_list; + row := field_str :: !row; + field := []; + state := StartField + in + let empty_field () = (* Append an empty string to the row. *) + row := "" :: !row; + field := []; + state := StartField + in + let end_of_row () = (* Hand the finished row to f, then start a new row. *) + let row_list = List.rev !row in + f row_list; + row := []; + state := StartField + in + let rec loop () = (* Loops until input_char raises End_of_file, which is caught below. *) + let c = input_char chan in + if c != '\r' then ( (* Always ignore \r characters, even inside quoted fields. *) + match !state with + StartField -> (* Expecting quote or other char. *) + if c = '\"' then ( + state := InQuotedField; + field := [] + ) else if c = ',' then (* Empty field. *) + empty_field () + else if c = '\n' then ( (* Empty field, end of row. 
*) + empty_field (); + end_of_row () + ) else ( + state := InUnquotedField; + field := [c] + ) + | InUnquotedField -> (* Reading chars to end of field. *) + if c = ',' then (* End of field. *) + end_of_field () + else if c = '\n' then ( (* End of field and end of row. *) + end_of_field (); + end_of_row () + ) else + field := c :: !field + | InQuotedField -> (* Reading chars up to the next quote. *) + if c = '\"' then + state := InQuotedFieldAfterQuote + else + field := c :: !field + | InQuotedFieldAfterQuote -> (* Escape sequence or end of the quoted field. *) + if c = '\"' then ( (* Doubled quote. *) + field := c :: !field; + state := InQuotedField + ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *) + field := '\000' :: !field; + state := InQuotedField + ) else if c = ',' then (* End of field. *) + end_of_field () + else if c = '\n' then ( (* End of field and end of row. *) + end_of_field (); + end_of_row () + ) (* NOTE(review): any other char is dropped here and the state is kept. *) + ); (* end of match *) + loop () + in + try + loop () + with + End_of_file -> + (* End of input: flush a final row that has no trailing newline. *) + (match !state with + StartField -> + if !row <> [] then + ( empty_field (); end_of_row () ) + | InUnquotedField | InQuotedFieldAfterQuote -> + end_of_field (); end_of_row () + | InQuotedField -> + raise (Bad_CSV_file "Missing end quote after quoted field.") + ) + +let load_in chan = (* Collect every row of chan, in input order. *) + let csv = ref [] in + let f row = + csv := row :: !csv + in + load_rows f chan; + List.rev !csv + +let load filename = (* NOTE(review): chan is not closed if load_in raises. *) + let chan = open_in filename in + let csv = load_in chan in + close_in chan; + csv + +(* Quote a single CSV field. 
*) +let quote_field field = (* Fields without a comma, quote or newline are returned as is. *) + if String.contains field ',' || + String.contains field '\"' || + String.contains field '\n' + then ( + let buffer = Buffer.create 100 in + Buffer.add_char buffer '\"'; + for i = 0 to (String.length field) - 1 do + match field.[i] with + '\"' -> Buffer.add_string buffer "\"\"" (* Double each embedded quote. *) + | c -> Buffer.add_char buffer c + done; + Buffer.add_char buffer '\"'; + Buffer.contents buffer + ) + else + field + +let save_out chan csv = (* Each row: quoted fields joined by commas, then a newline. *) + List.iter (fun line -> + output_string chan (String.concat "," + (List.map quote_field line)); + output_char chan '\n') csv + +let print csv = (* Write csv to stdout. *) + save_out stdout csv + +let save file csv = (* NOTE(review): chan is not closed if save_out raises. *) + let chan = open_out file in + save_out chan csv; + close_out chan diff --git a/csv.mli b/csv.mli new file mode 100644 index 0000000..fd33ac3 --- /dev/null +++ b/csv.mli @@ -0,0 +1,46 @@ +(** csv.mli - comma separated values parser + * + * $Id: csv.mli,v 1.1 2003-12-17 16:05:08 rich Exp $ + *) + +type t = string list list +(** Representation of CSV files: a list of rows, each a list of fields. *) + +exception Bad_CSV_file of string +(** Raised for badly formed CSV files; the string describes the problem. *) + +val lines : t -> int +(** Work out the number of rows in a CSV file. *) + +val columns : t -> int +(** Work out the (maximum) number of columns in a CSV file. Note that each + line may be a different length, so this finds the one with the most + columns. *) + +val load_in : in_channel -> t +(** Load a whole CSV file from an open channel; the channel is not closed. + * @param chan Input file stream + *) + +val load : string -> t +(** Load a whole CSV file by name. + * @param filename CSV filename. + *) + +val load_rows : (string list -> unit) -> in_channel -> unit +(** For very large CSV files which cannot be processed in memory at once, + * this function is appropriate. It parses the input one row at a time and + * calls your function once for each row. + * + * @param f Callout function, given the fields of one row. + * @param chan Input file stream. 
+ *) + +val print : t -> unit +(** Print a CSV value on stdout - same as [save_out stdout]. *) + +val save_out : out_channel -> t -> unit +(** Write a CSV value to a channel, quoting fields as needed; the channel is not closed. *) + +val save : string -> t -> unit +(** Write a CSV value to the named file. *) diff --git a/test.ml b/test.ml new file mode 100644 index 0000000..2ab02ad --- /dev/null +++ b/test.ml @@ -0,0 +1,52 @@ +(* $Id: test.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ *) + +open Printf +open Csv + +let do_testcsv filename expected = (* Load filename; if the result differs from expected, print both and fail. *) + let csv = load filename in + if csv <> expected then ( + printf "input file: %s\n" filename; + printf "Csv library produced:\n"; + print csv; + printf "Expected:\n"; + print expected; + failwith "failed" + ) + +let testcsv1 = (* A single quoted field containing commas and newlines. *) + do_testcsv + "testcsv1.csv" + [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns." ] ] +let testcsv2 = (* Doubled quotes inside a quoted field. *) + do_testcsv + "testcsv2.csv" + [ [ "Normal field"; "Quoted field"; "Quoted field with \"\" quotes" ] ] +let testcsv3 = (* Rows made only of empty fields. *) + do_testcsv + "testcsv3.csv" + [ [ "" ]; + [ ""; "" ]; + [ ""; ""; "" ]; + [ ""; ""; ""; "" ]; + [ ""; ""; ""; ""; "" ] ] +let testcsv4 = (* An empty file has no rows. *) + do_testcsv + "testcsv4.csv" + [] +let testcsv5 = (* A multi-line quoted field followed by unquoted fields. *) + do_testcsv + "testcsv5.csv" + [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns."; + "a second field"; "a third field" ]; + [ "a fourth field on a new line" ] ] +let testcsv6 = (* Quote-0 inside a quoted field decodes to NUL. *) + do_testcsv + "testcsv6.csv" + [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns\nand \000"; + "a second field"; "a third field" ]; + [ "a fourth field on a new line" ] ] + +;; + +print_endline "All tests succeeded." diff --git a/testcsv1.csv b/testcsv1.csv new file mode 100644 index 0000000..bb49ef6 --- /dev/null +++ b/testcsv1.csv @@ -0,0 +1,4 @@ +"This is a test +with commas,,,,, + +and carriage returns." 
\ No newline at end of file diff --git a/testcsv2.csv b/testcsv2.csv new file mode 100644 index 0000000..4f27203 --- /dev/null +++ b/testcsv2.csv @@ -0,0 +1 @@ +Normal field,"Quoted field","Quoted field with """" quotes" diff --git a/testcsv3.csv b/testcsv3.csv new file mode 100644 index 0000000..6ae9a38 --- /dev/null +++ b/testcsv3.csv @@ -0,0 +1,5 @@ + +, +,, +,,, +,,,, \ No newline at end of file diff --git a/testcsv4.csv b/testcsv4.csv new file mode 100644 index 0000000..e69de29 diff --git a/testcsv5.csv b/testcsv5.csv new file mode 100644 index 0000000..243f92f --- /dev/null +++ b/testcsv5.csv @@ -0,0 +1,5 @@ +"This is a test +with commas,,,,, + +and carriage returns.",a second field,a third field +a fourth field on a new line \ No newline at end of file diff --git a/testcsv6.csv b/testcsv6.csv new file mode 100644 index 0000000..c8ee1bd --- /dev/null +++ b/testcsv6.csv @@ -0,0 +1,6 @@ +"This is a test +with commas,,,,, + +and carriage returns +and "0",a second field,a third field +a fourth field on a new line \ No newline at end of file -- 1.8.3.1