X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;f=csv.ml;h=82d8d02f218ca5df9f07b49e4f01a4b4d6542298;hb=3f5ce21cf955364c5b665ac98dab8545b3f1c6b3;hp=b5c392b79f449c6b838ff5185acee78622c2c3c9;hpb=677587e4828c264c36a2ddce644697c9d4b8ac06;p=ocaml-csv.git diff --git a/csv.ml b/csv.ml index b5c392b..82d8d02 100644 --- a/csv.ml +++ b/csv.ml @@ -1,6 +1,6 @@ (* csv.ml - comma separated values parser * - * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ + * $Id: csv.ml,v 1.10 2006-04-21 12:44:42 rich Exp $ *) (* The format of CSV files: @@ -15,9 +15,12 @@ * following is a quote: "", and that's all" is the CSV equivalent of * the following literal field: The following is a quote: ", and that's * all + * + * "0 is the quoted form of ASCII NUL. * * CSV fields can also contain literal carriage return characters, if - * they are quoted, eg: "This field is split over lines" represents a + * they are quoted, eg: "This field + * is split over lines" represents a * single field containing a \n. * * Excel will only use the quoting format if a field contains a double @@ -35,17 +38,28 @@ * and often will, have different lengths). We then provide simple * functions to read the CSV file line-by-line, copy it out, or copy a * subset of it into a matrix. - * - * For future work: According to the Text::CSV_XS manual page, "0 is a - * valid encoding, within quoted fields, of the ASCII NUL character. In - * Unix this character could, of course, be encoded directly in the - * file. *) +open Printf + type t = string list list exception Bad_CSV_file of string +let rec dropwhile f = function + | [] -> [] + | x :: xs when f x -> dropwhile f xs + | xs -> xs + +(* from extlib: *) +let rec drop n = function + | _ :: l when n > 0 -> drop (n-1) l + | l -> l + +let rec take n = function + | x :: xs when n > 0 -> x :: take (pred n) xs + | _ -> [] + let lines = List.length let columns csv = @@ -56,7 +70,7 @@ type state_t = StartField | InQuotedField | InQuotedFieldAfterQuote -let load_rows f chan = +let load_rows ?(separator = ',') f chan = let row = ref [] in (* Current row. *) let field = ref [] in (* Current field. *) let state = ref StartField in (* Current state. *) @@ -91,10 +105,10 @@ let load_rows f chan = if c != '\r' then ( (* Always ignore \r characters. *) match !state with StartField -> (* Expecting quote or other char. *) - if c = '\"' then ( + if c = '"' then ( state := InQuotedField; field := [] - ) else if c = ',' then (* Empty field. *) + ) else if c = separator then (* Empty field. *) empty_field () else if c = '\n' then ( (* Empty field, end of row. *) empty_field (); @@ -104,7 +118,7 @@ let load_rows f chan = field := [c] ) | InUnquotedField -> (* Reading chars to end of field. *) - if c = ',' then (* End of field. *) + if c = separator then (* End of field. *) end_of_field () else if c = '\n' then ( (* End of field and end of row. *) end_of_field (); @@ -112,22 +126,25 @@ let load_rows f chan = ) else field := c :: !field | InQuotedField -> (* Reading chars to end of field. *) - if c = '\"' then + if c = '"' then state := InQuotedFieldAfterQuote else field := c :: !field | InQuotedFieldAfterQuote -> - if c = '\"' then ( (* Doubled quote. *) + if c = '"' then ( (* Doubled quote. *) field := c :: !field; state := InQuotedField ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *) field := '\000' :: !field; state := InQuotedField - ) else if c = ',' then (* End of field. *) + ) else if c = separator then (* End of field. *) end_of_field () else if c = '\n' then ( (* End of field and end of row. *) end_of_field (); end_of_row () + ) else ( (* Bad single quote in field. *) + field := c :: '"' :: !field; + state := InQuotedField ) ); (* end of match *) loop () @@ -147,49 +164,253 @@ let load_rows f chan = raise (Bad_CSV_file "Missing end quote after quoted field.") ) -let load_in chan = +let load_in ?separator chan = let csv = ref [] in let f row = csv := row :: !csv in - load_rows f chan; + load_rows ?separator f chan; List.rev !csv -let load filename = +let load ?separator filename = let chan = open_in filename in - let csv = load_in chan in + let csv = load_in ?separator chan in close_in chan; csv -(* Quote a single CSV field. *) -let quote_field field = - if String.contains field ',' || - String.contains field '\"' || - String.contains field '\n' - then ( - let buffer = Buffer.create 100 in - Buffer.add_char buffer '\"'; - for i = 0 to (String.length field) - 1 do - match field.[i] with - '\"' -> Buffer.add_string buffer "\"\"" - | c -> Buffer.add_char buffer c - done; - Buffer.add_char buffer '\"'; - Buffer.contents buffer - ) - else - field - -let save_out chan csv = +let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv = + let rec empty_row = function + | [] -> true + | x :: xs when x <> "" -> false + | x :: xs -> empty_row xs + in + let csv = if top then dropwhile empty_row csv else csv in + let csv = + if right then + List.map (fun row -> + let row = List.rev row in + let row = dropwhile ((=) "") row in + let row = List.rev row in + row) csv + else csv in + let csv = + if bottom then ( + let csv = List.rev csv in + let csv = dropwhile empty_row csv in + let csv = List.rev csv in + csv + ) else csv in + + let empty_left_cell = + function [] -> true | x :: xs when x = "" -> true | _ -> false in + let empty_left_col = + List.fold_left (fun a row -> a && empty_left_cell row) true in + let remove_left_col = + List.map (function [] -> [] | x :: xs -> xs) in + let rec loop csv = + if empty_left_col csv then ( + let csv = remove_left_col csv in + loop csv + ) else csv + in + + let csv = if left then loop csv else csv in + + csv + +let square csv = + let columns = columns csv in + List.map ( + fun row -> + let n = List.length row in + let row = List.rev row in + let rec loop acc = function + | 0 -> acc + | i -> "" :: loop acc (i-1) + in + let row = loop row (columns - n) in + List.rev row + ) csv + +let is_square csv = + let columns = columns csv in + List.for_all (fun row -> List.length row = columns) csv + +let rec set_columns cols = function + | [] -> [] + | r :: rs -> + let rec loop i cells = + if i < cols then ( + match cells with + | [] -> "" :: loop (succ i) [] + | c :: cs -> c :: loop (succ i) cs + ) + else [] + in + loop 0 r :: set_columns cols rs + +let rec set_rows rows csv = + if rows > 0 then ( + match csv with + | [] -> [] :: set_rows (pred rows) [] + | r :: rs -> r :: set_rows (pred rows) rs + ) + else [] + +let set_size rows cols csv = + set_columns cols (set_rows rows csv) + +let sub r c rows cols csv = + let csv = drop r csv in + let csv = List.map (drop c) csv in + let csv = set_rows rows csv in + let csv = set_columns cols csv in + csv + +(* Compare two rows for semantic equality - ignoring any blank cells + * at the end of each row. + *) +let rec compare_row (row1 : string list) row2 = + match row1, row2 with + | [], [] -> 0 + | x :: xs, y :: ys -> + let c = compare x y in + if c <> 0 then c else compare_row xs ys + | "" :: xs , [] -> + compare_row xs [] + | x :: xs, [] -> + 1 + | [], "" :: ys -> + compare_row [] ys + | [], y :: ys -> + -1 + +(* Semantic equality for CSV files. *) +let rec compare (csv1 : t) csv2 = + match csv1, csv2 with + | [], [] -> 0 + | x :: xs, y :: ys -> + let c = compare_row x y in + if c <> 0 then c else compare xs ys + | x :: xs, [] -> + let c = compare_row x [] in + if c <> 0 then c else compare xs [] + | [], y :: ys -> + let c = compare_row [] y in + if c <> 0 then c else compare [] ys + +let to_array csv = + Array.of_list (List.map Array.of_list csv) + +let of_array csv = + List.map Array.to_list (Array.to_list csv) + +let associate header data = + let nr_cols = List.length header in + let rec trunc = function + | 0, _ -> [] + | n, [] -> "" :: trunc (n-1, []) + | n, (x :: xs) -> x :: trunc (n-1, xs) + in + List.map ( + fun row -> + let row = trunc (nr_cols, row) in + List.combine header row + ) data + +let save_out ?(separator = ',') chan csv = + (* Quote a single CSV field. *) + let quote_field field = + if String.contains field separator || + String.contains field '\"' || + String.contains field '\n' + then ( + let buffer = Buffer.create 100 in + Buffer.add_char buffer '\"'; + for i = 0 to (String.length field) - 1 do + match field.[i] with + '\"' -> Buffer.add_string buffer "\"\"" + | c -> Buffer.add_char buffer c + done; + Buffer.add_char buffer '\"'; + Buffer.contents buffer + ) + else + field + in + + let separator = String.make 1 separator in List.iter (fun line -> - output_string chan (String.concat "," + output_string chan (String.concat separator (List.map quote_field line)); output_char chan '\n') csv -let print csv = - save_out stdout csv +let print ?separator csv = + save_out ?separator stdout csv; flush stdout -let save file csv = +let save ?separator file csv = let chan = open_out file in - save_out chan csv; + save_out ?separator chan csv; close_out chan + +let save_out_readable chan csv = + (* Escape all the strings in the CSV file first. *) + (* XXX Why are we doing this? I commented it out anyway. + let csv = List.map (List.map String.escaped) csv in + *) + + (* Find the width of each column. *) + let widths = + (* Don't consider rows with only a single element - typically + * long titles. + *) + let csv = List.filter (function [_] -> false | _ -> true) csv in + + (* Square the CSV file - makes the next step simpler to implement. *) + let csv = square csv in + + match csv with + | [] -> [] + | row1 :: rest -> + let lengths_row1 = List.map String.length row1 in + let lengths_rest = List.map (List.map String.length) rest in + let max2rows r1 r2 = + let rp = + try List.combine r1 r2 + with + Invalid_argument "List.combine" -> + failwith "Csv.save_out_readable: internal error: length r1 = %d, length r2 = %d" (List.length r1) (List.length r2) in + List.map (fun ((a : int), (b : int)) -> max a b) rp + in + List.fold_left max2rows lengths_row1 lengths_rest in + + (* Print out each cell at the correct width. *) + let rec repeat f = function + | 0 -> () + | i -> f (); repeat f (i-1) + in + List.iter ( + function + | [cell] -> (* Single column. *) + output_string chan cell; + output_char chan '\n' + | row -> (* Other. *) + (* Pair up each cell with its max width. *) + let row = + let rec loop = function + | ([], _) -> [] + | (_, []) -> failwith "Csv.save_out_readable: internal error" + | (cell :: cells, width :: widths) -> + (cell, width) :: loop (cells, widths) + in + loop (row, widths) in + List.iter ( + fun (cell, width) -> + output_string chan cell; + let n = String.length cell in + repeat (fun () -> output_char chan ' ') (width - n + 1) + ) row; + output_char chan '\n' + ) csv + +let print_readable = save_out_readable stdout