From: rich <rich>
Date: Wed, 17 Dec 2003 16:05:08 +0000 (+0000)
Subject: Added CSV parsing library, and fixed handling of "0 (ASCII NUL).
X-Git-Url: http://git.annexia.org/?a=commitdiff_plain;h=677587e4828c264c36a2ddce644697c9d4b8ac06;p=ocaml-csv.git

Added CSV parsing library, and fixed handling of "0 (ASCII NUL).
---

677587e4828c264c36a2ddce644697c9d4b8ac06
diff --git a/.cvsignore b/.cvsignore
new file mode 100644
index 0000000..b6131c8
--- /dev/null
+++ b/.cvsignore
@@ -0,0 +1,6 @@
+*.cma
+*.cmi
+*.cmo
+*.cmx
+*.cmxa
+test
\ No newline at end of file
diff --git a/.depend b/.depend
new file mode 100644
index 0000000..d77da96
--- /dev/null
+++ b/.depend
@@ -0,0 +1,4 @@
+csv.cmo: csv.cmi 
+csv.cmx: csv.cmi 
+test.cmo: csv.cmi 
+test.cmx: csv.cmx 
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..09f55b8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,54 @@
+# $Id: Makefile,v 1.1 2003-12-17 16:05:08 rich Exp $
+
+OCAMLC		:= ocamlc
+OCAMLCINCS	:=
+OCAMLCFLAGS	:= -w s -g
+OCAMLCLIBS	:=
+
+OCAMLOPT	:= ocamlopt
+OCAMLOPTINCS	:= $(OCAMLCINCS)
+OCAMLOPTFLAGS	:= -w s
+OCAMLOPTLIBS	:=
+
+OBJS		:= csv.cmo
+XOBJS		:= $(OBJS:.cmo=.cmx)
+
+all: csv.cma csv.cmxa
+
+csv.cma: $(OBJS)
+	$(OCAMLC) $(OCAMLCFLAGS) -a -o $@ $^
+
+csv.cmxa: $(XOBJS)
+	$(OCAMLOPT) $(OCAMLOPTFLAGS) -a -o $@ $^
+
+test:	csv.cma test.ml
+	$(OCAMLC) csv.cma test.ml -o test
+	./test
+
+# Common rules for building OCaml objects.
+
+.mli.cmi:
+	$(OCAMLC) $(OCAMLCFLAGS) $(OCAMLCINCS) -c $<
+.ml.cmo:
+	$(OCAMLC) $(OCAMLCFLAGS) $(OCAMLCINCS) -c $<
+.ml.cmx:
+	$(OCAMLOPT) $(OCAMLOPTFLAGS) $(OCAMLOPTINCS) -c $<
+
+# Clean: remove everything the build generates (incl. csv.a and ./test).
+
+clean:
+	rm -f *.cmi *.cmo *.cmx *.cma *.cmxa *.o *.a *~ core test
+
+# Dependencies.
+
+depend: .depend
+
+.depend: $(wildcard *.mli) $(wildcard *.ml)
+	rm -f .depend
+	ocamldep $^ > $@
+
+ifeq ($(wildcard .depend),.depend)
+include .depend
+endif
+
+.SUFFIXES:	.cmo .cmi .cmx .ml .mli
diff --git a/csv.ml b/csv.ml
new file mode 100644
index 0000000..b5c392b
--- /dev/null
+++ b/csv.ml
@@ -0,0 +1,195 @@
+(* csv.ml - comma separated values parser
+ *
+ * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $
+ *)
+
+(* The format of CSV files:
+ * 
+ * Each field starts with either a double quote char or some other
+ * char. In the latter case things are simple: just read up to the
+ * next comma (,) or newline, which marks the end of the field.
+ * 
+ * In the case where a field begins with a double quote char the
+ * parsing rules are different. Any double quotes are doubled ("") and
+ * we finish reading when we reach an undoubled quote. eg: "The
+ * following is a quote: "", and that's all" is the CSV equivalent of
+ * the following literal field: The following is a quote: ", and that's
+ * all
+ * 
+ * CSV fields can also contain literal newline characters, if they are
+ * quoted, eg: "This field is split over lines" (broken across two
+ * lines in the file) represents a single field containing a \n.
+ * 
+ * Excel will only use the quoting format if a field contains a double
+ * quote or comma, although there's no reason why Excel couldn't always
+ * use the quoted format.
+ * 
+ * The practical upshot of this is that you can't split a line in a CSV
+ * file just by looking at the commas. You need to parse each field
+ * separately.
+ * 
+ * How we represent CSV files:
+ * 
+ * We load in the whole CSV file at once, and store it internally as a
+ * 'string list list' type (note that each line in the CSV file can,
+ * and often will, have different lengths). We then provide simple
+ * functions to read the CSV file line-by-line, copy it out, or copy a
+ * subset of it into a matrix.
+ * 
+ * Note: According to the Text::CSV_XS manual page, a double quote
+ * followed by 0 is a valid encoding, within quoted fields, of the ASCII
+ * NUL character; load_rows decodes it. In Unix this character could,
+ * of course, be encoded directly in the file.
+ *)
+
+type t = string list list              (* A list of rows, each a list of fields. *)
+
+exception Bad_CSV_file of string       (* Raised by load_rows on EOF inside a quoted field. *)
+
+let lines csv = List.length csv        (* Number of rows in the CSV. *)
+
+let columns csv =                      (* Width of the widest row; 0 if none. *)
+  List.fold_left (fun widest r -> max widest (List.length r)) 0 csv
+
+type state_t = StartField              (* At the start of a field. *)
+	       | InUnquotedField         (* Inside a field that did not open with a quote. *)
+	       | InQuotedField           (* Inside a field that opened with a quote. *)
+	       | InQuotedFieldAfterQuote (* Just read a quote while inside a quoted field. *)
+
+(* Parse CSV data from [chan], calling [f] once per completed row. *)
+let load_rows f chan =
+  let row = ref [] in                   (* Fields of current row, reversed. *)
+  let field = ref [] in                 (* Chars of current field, reversed. *)
+  let state = ref StartField in         (* Current parser state. *)
+  (* Finish the current field: turn the accumulated chars into a string
+   * and push it onto the row.  A Buffer is used rather than an in-place
+   * string update so this also compiles where strings are immutable. *)
+  let end_of_field () =
+    let chars = List.rev !field in
+    let buf = Buffer.create 64 in
+    List.iter (Buffer.add_char buf) chars;
+    row := Buffer.contents buf :: !row;
+    field := [];
+    state := StartField
+  in
+  (* Push an empty field onto the current row. *)
+  let empty_field () =
+    row := "" :: !row;
+    field := [];
+    state := StartField
+  in
+  (* Hand the completed row, in file order, to [f] and start a new one. *)
+  let end_of_row () =
+    let row_list = List.rev !row in
+    f row_list;
+    row := [];
+    state := StartField
+  in
+  (* Read chars one at a time until [input_char] raises End_of_file. *)
+  let rec loop () =
+    let c = input_char chan in
+    if c <> '\r' then (                 (* Always ignore \r characters. *)
+      match !state with
+          StartField ->                 (* Expecting quote or other char. *)
+            if c = '\"' then (
+              state := InQuotedField;
+              field := []
+            ) else if c = ',' then      (* Empty field. *)
+              empty_field ()
+            else if c = '\n' then (     (* Empty field, end of row. *)
+              empty_field ();
+              end_of_row ()
+            ) else (
+              state := InUnquotedField;
+              field := [c]
+            )
+        | InUnquotedField ->            (* Reading chars to end of field. *)
+            if c = ',' then             (* End of field. *)
+              end_of_field ()
+            else if c = '\n' then (     (* End of field and end of row. *)
+              end_of_field ();
+              end_of_row ()
+            ) else
+              field := c :: !field
+        | InQuotedField ->              (* Reading chars to end of field. *)
+            if c = '\"' then
+              state := InQuotedFieldAfterQuote
+            else
+              field := c :: !field
+        | InQuotedFieldAfterQuote ->    (* Seen a quote in a quoted field. *)
+            if c = '\"' then (          (* Doubled quote. *)
+              field := c :: !field;
+              state := InQuotedField
+            ) else if c = '0' then (    (* Quote-0 is ASCII NUL. *)
+              field := '\000' :: !field;
+              state := InQuotedField
+            ) else if c = ',' then      (* End of field. *)
+              end_of_field ()
+            else if c = '\n' then (     (* End of field and end of row. *)
+              end_of_field ();
+              end_of_row ()
+            )                           (* Any other char is silently dropped. *)
+    ); (* end of match *)
+    loop ()
+  in
+  try
+    loop ()
+  with
+      End_of_file ->
+        (* Any part left to write out? *)
+        (match !state with
+             StartField ->
+               if !row <> [] then
+                 ( empty_field (); end_of_row () )
+           | InUnquotedField | InQuotedFieldAfterQuote ->
+               end_of_field (); end_of_row ()
+           | InQuotedField ->
+               raise (Bad_CSV_file "Missing end quote after quoted field.")
+        )
+
+(* Load every row from [chan], returning them in file order. *)
+let load_in chan =
+  let acc = ref [] in
+  (* load_rows yields rows first to last; consing reverses them, so
+   * the list is flipped back before returning. *)
+  load_rows (fun r -> acc := r :: !acc) chan;
+  List.rev !acc
+
+(* Load the CSV file [filename]; the channel is closed even on error. *)
+let load filename =
+  let chan = open_in filename in
+  let csv = try load_in chan with e -> close_in chan; raise e in
+  close_in chan; csv
+
+(* Return [field] as it should appear in a CSV file: unchanged unless it
+ * contains a comma, a double quote or a newline, in which case it is
+ * wrapped in double quotes with every inner double quote doubled. *)
+let quote_field field =
+  let needs_quoting =
+    List.exists (String.contains field) [','; '\"'; '\n'] in
+  if not needs_quoting then field
+  else begin
+    let out = Buffer.create 100 in
+    Buffer.add_char out '\"';
+    String.iter
+      (fun ch ->
+         if ch = '\"' then Buffer.add_string out "\"\""
+         else Buffer.add_char out ch)
+      field;
+    Buffer.add_char out '\"';
+    Buffer.contents out
+  end
+
+let save_out chan csv =                (* One comma-separated line per row. *)
+  let emit fields =
+    output_string chan (String.concat "," (List.map quote_field fields));
+    output_char chan '\n' in
+  List.iter emit csv
+
+(* Print a CSV on standard output; same as [save_out stdout]. *)
+let print = save_out stdout
+
+(* Save [csv] to [file]; the channel is closed even if writing fails. *)
+let save file csv =
+  let chan = open_out file in
+  (try save_out chan csv with e -> close_out chan; raise e); close_out chan
diff --git a/csv.mli b/csv.mli
new file mode 100644
index 0000000..fd33ac3
--- /dev/null
+++ b/csv.mli
@@ -0,0 +1,46 @@
+(** csv.mli - comma separated values parser
+  *
+  * $Id: csv.mli,v 1.1 2003-12-17 16:05:08 rich Exp $
+  *)
+
+type t = string list list
+(** Representation of CSV files. *)
+
+exception Bad_CSV_file of string
+(** Badly formed CSV files cause this exception to be raised. *)
+
+val lines : t -> int
+(** Work out the number of lines in a CSV file. *)
+
+val columns : t -> int
+(** Work out the (maximum) number of columns in a CSV file. Note that each
+   line may be a different length, so this finds the one with the most
+   columns. *)
+
+val load_in : in_channel -> t
+(** Load a CSV file.
+  * @param chan Input file stream
+  *)
+
+val load : string -> t
+(** Load a CSV file.
+  * @param filename CSV filename.
+  *)
+
+val load_rows : (string list -> unit) -> in_channel -> unit
+(** For very large CSV files which cannot be processed in memory at once,
+  * this function is appropriate. It parses the input one row at a time and
+  * calls your function once for each row.
+  *
+  * @param f Callout function.
+  * @param chan Input file stream.
+  *)
+
+val print : t -> unit
+(** Print string list list - same as [save_out stdout] *)
+
+val save_out : out_channel -> t -> unit
+(** Save string list list to a channel. *)
+
+val save : string -> t -> unit
+(** Save string list list to a file. *)
diff --git a/test.ml b/test.ml
new file mode 100644
index 0000000..2ab02ad
--- /dev/null
+++ b/test.ml
@@ -0,0 +1,52 @@
+(* $Id: test.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ *)
+
+open Printf
+open Csv
+(* Load [filename] and compare with [expected]; on mismatch print both and fail. *)
+let do_testcsv filename expected =
+  let csv = load filename in
+  if csv <> expected then (             (* Structural comparison of the rows. *)
+    printf "input file: %s\n" filename;
+    printf "Csv library produced:\n";
+    print csv;                          (* Csv.print: writes the rows to stdout. *)
+    printf "Expected:\n";
+    print expected;
+    failwith "failed"                   (* Raises Failure. *)
+  )
+(* Test cases: each top-level binding calls do_testcsv on one fixture file. *)
+let testcsv1 =                         (* One quoted field spanning several lines. *)
+  do_testcsv
+    "testcsv1.csv"
+    [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns." ] ]
+let testcsv2 =                         (* Unquoted, quoted, and doubled-quote fields. *)
+  do_testcsv
+    "testcsv2.csv"
+    [ [ "Normal field"; "Quoted field"; "Quoted field with \"\" quotes" ] ]
+let testcsv3 =                         (* Rows made only of empty fields. *)
+  do_testcsv
+    "testcsv3.csv"
+    [ [ "" ];
+      [ ""; "" ];
+      [ ""; ""; "" ];
+      [ ""; ""; ""; "" ];
+      [ ""; ""; ""; ""; "" ] ]
+let testcsv4 =                         (* Expects no rows at all. *)
+  do_testcsv
+    "testcsv4.csv"
+    []
+let testcsv5 =                         (* Multi-line quoted field plus plain fields. *)
+  do_testcsv
+    "testcsv5.csv"
+    [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns.";
+	"a second field"; "a third field" ];
+      [ "a fourth field on a new line" ] ]
+let testcsv6 =                         (* As testcsv5, with a NUL in the first field. *)
+  do_testcsv
+    "testcsv6.csv"
+    [ [ "This is a test\nwith commas,,,,,\n\nand carriage returns\nand \000";
+	"a second field"; "a third field" ];
+      [ "a fourth field on a new line" ] ]
+
+;;
+
+print_endline "All tests succeeded."
diff --git a/testcsv1.csv b/testcsv1.csv
new file mode 100644
index 0000000..bb49ef6
--- /dev/null
+++ b/testcsv1.csv
@@ -0,0 +1,4 @@
+"This is a test
+with commas,,,,,
+
+and carriage returns."
\ No newline at end of file
diff --git a/testcsv2.csv b/testcsv2.csv
new file mode 100644
index 0000000..4f27203
--- /dev/null
+++ b/testcsv2.csv
@@ -0,0 +1 @@
+Normal field,"Quoted field","Quoted field with """" quotes"
diff --git a/testcsv3.csv b/testcsv3.csv
new file mode 100644
index 0000000..6ae9a38
--- /dev/null
+++ b/testcsv3.csv
@@ -0,0 +1,5 @@
+
+,
+,,
+,,,
+,,,,
\ No newline at end of file
diff --git a/testcsv4.csv b/testcsv4.csv
new file mode 100644
index 0000000..e69de29
diff --git a/testcsv5.csv b/testcsv5.csv
new file mode 100644
index 0000000..243f92f
--- /dev/null
+++ b/testcsv5.csv
@@ -0,0 +1,5 @@
+"This is a test
+with commas,,,,,
+
+and carriage returns.",a second field,a third field
+a fourth field on a new line
\ No newline at end of file
diff --git a/testcsv6.csv b/testcsv6.csv
new file mode 100644
index 0000000..c8ee1bd
--- /dev/null
+++ b/testcsv6.csv
@@ -0,0 +1,6 @@
+"This is a test
+with commas,,,,,
+
+and carriage returns
+and "0",a second field,a third field
+a fourth field on a new line
\ No newline at end of file