From: rich Date: Wed, 17 Dec 2003 16:05:08 +0000 (+0000) Subject: Added CSV parsing library, and fixed handling of "0 (ASCII NUL). X-Git-Url: http://git.annexia.org/?a=commitdiff_plain;h=677587e4828c264c36a2ddce644697c9d4b8ac06;p=ocaml-csv.git Added CSV parsing library, and fixed handling of "0 (ASCII NUL). --- 677587e4828c264c36a2ddce644697c9d4b8ac06 diff --git a/.cvsignore b/.cvsignore new file mode 100644 index 0000000..b6131c8 --- /dev/null +++ b/.cvsignore @@ -0,0 +1,6 @@ +*.cma +*.cmi +*.cmo +*.cmx +*.cmxa +test \ No newline at end of file diff --git a/.depend b/.depend new file mode 100644 index 0000000..d77da96 --- /dev/null +++ b/.depend @@ -0,0 +1,4 @@ +csv.cmo: csv.cmi +csv.cmx: csv.cmi +test.cmo: csv.cmi +test.cmx: csv.cmx diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..09f55b8 --- /dev/null +++ b/Makefile @@ -0,0 +1,54 @@ +# $Id: Makefile,v 1.1 2003-12-17 16:05:08 rich Exp $ + +OCAMLC := ocamlc +OCAMLCINCS := +OCAMLCFLAGS := -w s -g +OCAMLCLIBS := + +OCAMLOPT := ocamlopt +OCAMLOPTINCS := $(OCAMLCINCS) +OCAMLOPTFLAGS := -w s +OCAMLOPTLIBS := + +OBJS := csv.cmo +XOBJS := $(OBJS:.cmo=.cmx) + +all: csv.cma csv.cmxa + +csv.cma: $(OBJS) + $(OCAMLC) $(OCAMLCFLAGS) -a -o $@ $^ + +csv.cmxa: $(XOBJS) + $(OCAMLOPT) $(OCAMLOPTFLAGS) -a -o $@ $^ + +test: csv.cma test.ml + $(OCAMLC) csv.cma test.ml -o test + ./test + +# Common rules for building OCaml objects. + +.mli.cmi: + $(OCAMLC) $(OCAMLCFLAGS) $(OCAMLCINCS) -c $< +.ml.cmo: + $(OCAMLC) $(OCAMLCFLAGS) $(OCAMLCINCS) -c $< +.ml.cmx: + $(OCAMLOPT) $(OCAMLOPTFLAGS) $(OCAMLOPTINCS) -c $< + +# Clean. + +clean: + rm -f *.cmi *.cmo *.cmx *.cma *.cmxa *.o *~ core + +# Dependencies. + +depend: .depend + +.depend: $(wildcard *.mli) $(wildcard *.ml) + rm -f .depend + ocamldep $^ > $@ + +ifeq ($(wildcard .depend),.depend) +include .depend +endif + +.SUFFIXES: .cmo .cmi .cmx .ml .mli diff --git a/csv.ml b/csv.ml new file mode 100644 index 0000000..b5c392b --- /dev/null +++ b/csv.ml @@ -0,0 +1,195 @@ +(* csv.ml - comma separated values parser + * + * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ + *) + +(* The format of CSV files: + * + * Each field starts with either a double quote char or some other + * char. For the some other char case things are simple: just read up + * to the next comma (,) which marks the end of the field. + * + * In the case where a field begins with a double quote char the + * parsing rules are different. Any double quotes are doubled ("") and + * we finish reading when we reach an undoubled quote. eg: "The + * following is a quote: "", and that's all" is the CSV equivalent of + * the following literal field: The following is a quote: ", and that's + * all + * + * CSV fields can also contain literal carriage return characters, if + * they are quoted, eg: "This field is split over lines" represents a + * single field containing a \n. + * + * Excel will only use the quoting format if a field contains a double + * quote or comma, although there's no reason why Excel couldn't always + * use the quoted format. + * + * The practical upshot of this is that you can't split a line in a CSV + * file just by looking at the commas. You need to parse each field + * separately. + * + * How we represent CSV files: + * + * We load in the whole CSV file at once, and store it internally as a + * 'string list list' type (note that each line in the CSV file can, + * and often will, have different lengths). We then provide simple + * functions to read the CSV file line-by-line, copy it out, or copy a + * subset of it into a matrix. + * + * For future work: According to the Text::CSV_XS manual page, "0 is a + * valid encoding, within quoted fields, of the ASCII NUL character. In + * Unix this character could, of course, be encoded directly in the + * file. + *) + +type t = string list list + +exception Bad_CSV_file of string + +let lines = List.length + +let columns csv = + List.fold_left max 0 (List.map List.length csv) + +type state_t = StartField + | InUnquotedField + | InQuotedField + | InQuotedFieldAfterQuote + +let load_rows f chan = + let row = ref [] in (* Current row. *) + let field = ref [] in (* Current field. *) + let state = ref StartField in (* Current state. *) + let end_of_field () = + let field_list = List.rev !field in + let field_len = List.length field_list in + let field_str = String.create field_len in + let rec loop i = function + [] -> () + | x :: xs -> + field_str.[i] <- x; + loop (i+1) xs + in + loop 0 field_list; + row := field_str :: !row; + field := []; + state := StartField + in + let empty_field () = + row := "" :: !row; + field := []; + state := StartField + in + let end_of_row () = + let row_list = List.rev !row in + f row_list; + row := []; + state := StartField + in + let rec loop () = + let c = input_char chan in + if c != '\r' then ( (* Always ignore \r characters. *) + match !state with + StartField -> (* Expecting quote or other char. *) + if c = '\"' then ( + state := InQuotedField; + field := [] + ) else if c = ',' then (* Empty field. *) + empty_field () + else if c = '\n' then ( (* Empty field, end of row. *) + empty_field (); + end_of_row () + ) else ( + state := InUnquotedField; + field := [c] + ) + | InUnquotedField -> (* Reading chars to end of field. *) + if c = ',' then (* End of field. *) + end_of_field () + else if c = '\n' then ( (* End of field and end of row. *) + end_of_field (); + end_of_row () + ) else + field := c :: !field + | InQuotedField -> (* Reading chars to end of field. *) + if c = '\"' then + state := InQuotedFieldAfterQuote + else + field := c :: !field + | InQuotedFieldAfterQuote -> + if c = '\"' then ( (* Doubled quote. *) + field := c :: !field; + state := InQuotedField + ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *) + field := '\000' :: !field; + state := InQuotedField + ) else if c = ',' then (* End of field. *) + end_of_field () + else if c = '\n' then ( (* End of field and end of row. *) + end_of_field (); + end_of_row () + ) + ); (* end of match *) + loop () + in + try + loop () + with + End_of_file -> + (* Any part left to write out? *) + (match !state with + StartField -> + if !row <> [] then + ( empty_field (); end_of_row () ) + | InUnquotedField | InQuotedFieldAfterQuote -> + end_of_field (); end_of_row () + | InQuotedField -> + raise (Bad_CSV_file "Missing end quote after quoted field.") + ) + +let load_in chan = + let csv = ref [] in + let f row = + csv := row :: !csv + in + load_rows f chan; + List.rev !csv + +let load filename = + let chan = open_in filename in + let csv = load_in chan in + close_in chan; + csv + +(* Quote a single CSV field. *) +let quote_field field = + if String.contains field ',' || + String.contains field '\"' || + String.contains field '\n' + then ( + let buffer = Buffer.create 100 in + Buffer.add_char buffer '\"'; + for i = 0 to (String.length field) - 1 do + match field.[i] with + '\"' -> Buffer.add_string buffer "\"\"" + | c -> Buffer.add_char buffer c + done; + Buffer.add_char buffer '\"'; + Buffer.contents buffer + ) + else + field + +let save_out chan csv = + List.iter (fun line -> + output_string chan (String.concat "," + (List.map quote_field line)); + output_char chan '\n') csv + +let print csv = + save_out stdout csv + +let save file csv = + let chan = open_out file in + save_out chan csv; + close_out chan diff --git a/csv.mli b/csv.mli new file mode 100644 index 0000000..fd33ac3 --- /dev/null +++ b/csv.mli @@ -0,0 +1,46 @@ +(** csv.mli - comma separated values parser + * + * $Id: csv.mli,v 1.1 2003-12-17 16:05:08 rich Exp $ + *) + +type t = string list list +(** Representation of CSV files. *) + +exception Bad_CSV_file of string +(** Badly formed CSV files throw this exception: *) + +val lines : t -> int +(** Work out the number of lines in a CSV file. *) + +val columns : t -> int +(** Work out the (maximum) number of columns in a CSV file. Note that each + line may be a different length, so this finds the one with the most + columns. *) + +val load_in : in_channel -> t +(** Load a CSV file. + * @param chan Input file stream + *) + +val load : string -> t +(** Load a CSV file. + * @param filename CSV filename. + *) + +val load_rows : (string list -> unit) -> in_channel -> unit +(** For very large CSV files which cannot be processed in memory at once, + * this function is appropriate. It parses the input one row at a time and + * calls your function once for each row. + * + * @param f Callout function. + * @param chan Input file stream. + *) + +val print : t -> unit +(** Print string list list - same as [save_out stdout] *) + +val save_out : out_channel -> t -> unit +(** Save string list list to a channel. *) + +val save : string -> t -> unit +(** Save string list list to a file. *) diff --git a/test.ml b/test.ml new file mode 100644 index 0000000..2ab02ad --- /dev/null +++ b/test.ml @@ -0,0 +1,52 @@ +(* $Id: test.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ *) + +open Printf +open Csv + +let do_testcsv filename expected = + let csv = load filename in + if csv <> expected then ( + printf "input file: %s\n" filename; + printf "Csv library produced:\n"; + print csv; + printf "Expected:\n"; + print expected; + failwith "failed" + ) + +let testcsv1 = + do_testcsv + "testcsv1.csv" + [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns." ] ] +let testcsv2 = + do_testcsv + "testcsv2.csv" + [ [ "Normal field"; "Quoted field"; "Quoted field with \"\" quotes" ] ] +let testcsv3 = + do_testcsv + "testcsv3.csv" + [ [ "" ]; + [ ""; "" ]; + [ ""; ""; "" ]; + [ ""; ""; ""; "" ]; + [ ""; ""; ""; ""; "" ] ] +let testcsv4 = + do_testcsv + "testcsv4.csv" + [] +let testcsv5 = + do_testcsv + "testcsv5.csv" + [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns."; + "a second field"; "a third field" ]; + [ "a fourth field on a new line" ] ] +let testcsv6 = + do_testcsv + "testcsv6.csv" + [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns\nand \000"; + "a second field"; "a third field" ]; + [ "a fourth field on a new line" ] ] + +;; + +print_endline "All tests succeeded." diff --git a/testcsv1.csv b/testcsv1.csv new file mode 100644 index 0000000..bb49ef6 --- /dev/null +++ b/testcsv1.csv @@ -0,0 +1,4 @@ +"This is a test +with commas,,,,, + +and carriage returns." \ No newline at end of file diff --git a/testcsv2.csv b/testcsv2.csv new file mode 100644 index 0000000..4f27203 --- /dev/null +++ b/testcsv2.csv @@ -0,0 +1 @@ +Normal field,"Quoted field","Quoted field with """" quotes" diff --git a/testcsv3.csv b/testcsv3.csv new file mode 100644 index 0000000..6ae9a38 --- /dev/null +++ b/testcsv3.csv @@ -0,0 +1,5 @@ + +, +,, +,,, +,,,, \ No newline at end of file diff --git a/testcsv4.csv b/testcsv4.csv new file mode 100644 index 0000000..e69de29 diff --git a/testcsv5.csv b/testcsv5.csv new file mode 100644 index 0000000..243f92f --- /dev/null +++ b/testcsv5.csv @@ -0,0 +1,5 @@ +"This is a test +with commas,,,,, + +and carriage returns.",a second field,a third field +a fourth field on a new line \ No newline at end of file diff --git a/testcsv6.csv b/testcsv6.csv new file mode 100644 index 0000000..c8ee1bd --- /dev/null +++ b/testcsv6.csv @@ -0,0 +1,6 @@ +"This is a test +with commas,,,,, + +and carriage returns +and "0",a second field,a third field +a fourth field on a new line \ No newline at end of file