From 70ed84c05dfe890425592cc207f7a866f0afb04e Mon Sep 17 00:00:00 2001 From: rich Date: Wed, 27 Sep 2006 14:05:07 +0000 Subject: [PATCH] Test whether this code can be applied to Weblogs. --- .cvsignore | 3 +- .depend | 2 ++ MANIFEST | 1 + Makefile | 26 +++++++++++++-- Makefile.config | 8 +++-- ancient_c.c | 84 +++++++++++++++++++++++++--------------------- test_ancient_weblogs.ml | 88 +++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 168 insertions(+), 44 deletions(-) create mode 100644 test_ancient_weblogs.ml diff --git a/.cvsignore b/.cvsignore index f4687eb..d3c78b6 100644 --- a/.cvsignore +++ b/.cvsignore @@ -6,4 +6,5 @@ *.opt test_ancient.out1 test_ancient.out2 -META \ No newline at end of file +META +ancient-*.tar.gz \ No newline at end of file diff --git a/.depend b/.depend index 648baca..47db211 100644 --- a/.depend +++ b/.depend @@ -2,3 +2,5 @@ ancient.cmo: ancient.cmi ancient.cmx: ancient.cmi test_ancient.cmo: ancient.cmi test_ancient.cmx: ancient.cmx +test_ancient_weblogs.cmo: ancient.cmi +test_ancient_weblogs.cmx: ancient.cmx diff --git a/MANIFEST b/MANIFEST index 64736e0..db915e9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -8,3 +8,4 @@ Makefile.config MANIFEST META.in test_ancient.ml +test_ancient_weblogs.ml diff --git a/Makefile b/Makefile index 491bb0e..9115302 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Mark objects as 'ancient' so they are taken out of the OCaml heap. -# $Id: Makefile,v 1.3 2006-09-27 12:10:18 rich Exp $ +# $Id: Makefile,v 1.4 2006-09-27 14:05:07 rich Exp $ include Makefile.config @@ -16,7 +16,21 @@ OCAMLOPTLIBS := OCAMLDOCFLAGS := -html -stars -sort $(OCAMLCPACKAGES) -all: ancient.cma ancient.cmxa test_ancient.opt META +ifeq ($(TEST_WEBLOGS),1) +# For testing with large amount of weblogs data. +OCAMLCPACKAGES := -package calendar,pcre,extlib -I ../../freeware/weblogs +OCAMLCLIBS := -linkpkg weblogs.cma +OCAMLOPTPACKAGES := $(OCAMLCPACKAGES) +OCAMLOPTLIBS := -linkpkg weblogs.cmxa +endif + +TARGETS := ancient.cma ancient.cmxa META test_ancient.opt + +ifeq ($(TEST_WEBLOGS),1) +TARGETS += test_ancient_weblogs.opt +endif + +all: $(TARGETS) ancient.cma: ancient.cmo ancient_c.o ocamlmklib -o ancient $^ @@ -26,7 +40,13 @@ ancient.cmxa: ancient.cmx ancient_c.o test_ancient.opt: ancient.cmxa test_ancient.cmx LIBRARY_PATH=.:$$LIBRARY_PATH \ - ocamlfind ocamlopt $(OCAMLOPTFLAGS) $(OCAMLOPTLIBS) -o $@ $^ + ocamlfind ocamlopt $(OCAMLOPTFLAGS) $(OCAMLOPTPACKAGES) $(OCAMLOPTLIBS) -o $@ $^ + +ifeq ($(TEST_WEBLOGS),1) +test_ancient_weblogs.opt: ancient.cmxa test_ancient_weblogs.cmx + LIBRARY_PATH=.:$$LIBRARY_PATH \ + ocamlfind ocamlopt $(OCAMLOPTFLAGS) $(OCAMLOPTPACKAGES) $(OCAMLOPTLIBS) -o $@ $^ +endif # Common rules for building OCaml objects. diff --git a/Makefile.config b/Makefile.config index 0cad3c4..10ba562 100644 --- a/Makefile.config +++ b/Makefile.config @@ -1,5 +1,9 @@ # Mark objects as 'ancient' so they are taken out of the OCaml heap. -# $Id: Makefile.config,v 1.1 2006-09-27 12:09:02 rich Exp $ +# $Id: Makefile.config,v 1.2 2006-09-27 14:05:07 rich Exp $ PACKAGE := ancient -VERSION := 0.0.1 +VERSION := 0.0.2 + +ifeq ($(shell hostname),oirase) +TEST_WEBLOGS := 1 +endif \ No newline at end of file diff --git a/ancient_c.c b/ancient_c.c index 8a73c0c..b39e9d9 100644 --- a/ancient_c.c +++ b/ancient_c.c @@ -1,5 +1,5 @@ /* Mark objects as 'ancient' so they are taken out of the OCaml heap. - * $Id: ancient_c.c,v 1.1 2006-09-27 12:07:07 rich Exp $ + * $Id: ancient_c.c,v 1.2 2006-09-27 14:05:07 rich Exp $ */ #include @@ -14,6 +14,13 @@ // From byterun/misc.h: typedef char * addr; +// From byterun/minor_gc.c: +CAMLextern char *caml_young_start; +CAMLextern char *caml_young_end; +#define Is_young(val) \ + (assert (Is_block (val)), \ + (addr)(val) < (addr)caml_young_end && (addr)(val) > (addr)caml_young_start) + // From byterun/major_gc.h: #ifdef __alpha typedef int page_table_entry; @@ -99,12 +106,16 @@ static header_t visited = (unsigned long) -1; // object's header to a special 'visited' value. However since these // are objects in the Caml heap we have to restore the original // headers at the end, which is the purpose of the [restore] area. +// 4. We use realloc to allocate the memory for the copy, but because +// the memory can move around, we cannot store absolute pointers. +// Instead we store offsets and fix them up later. This is the +// purpose of the [fixups] area. // -// XXX Recursive function will probably fall over once we apply it to -// large, deeply recursive structures. Should be replaced with something -// iterative. +// XXX Large, deeply recursive structures cause a stack overflow. +// Temporary solution: 'ulimit -s unlimited'. This function should +// be replaced with something iterative. static size_t -mark (value obj, area *ptr, area *restore) +mark (value obj, area *ptr, area *restore, area *fixups) { char *header = Hp_val (obj); assert (Wosize_hp (header) > 0); // Always true? (XXX) @@ -112,7 +123,7 @@ mark (value obj, area *ptr, area *restore) // We can't handle out-of-heap objects. // XXX Since someone might try to mark an ancient object, they // might get this error, so we should try to do better here. - assert (Is_in_heap (obj)); + assert (Is_young (obj) || Is_in_heap (obj)); // If we've already visited this object, just return its offset // in the out-of-heap memory. @@ -136,8 +147,9 @@ mark (value obj, area *ptr, area *restore) for (i = 0; i < nr_words; ++i) { value field = Field (obj, i); - if (Is_block (field)) { - size_t field_offset = mark (field, ptr, restore); + if (Is_block (field) && + (Is_young (field) || Is_in_heap (field))) { + size_t field_offset = mark (field, ptr, restore, fixups); if (field_offset == -1) return -1; // Propagate out of memory errors. // Since the recursive call to mark above can reallocate the @@ -148,7 +160,10 @@ mark (value obj, area *ptr, area *restore) // Don't store absolute pointers yet because realloc will // move the memory around. Store a fake pointer instead. // We'll fix up these fake pointers afterwards. - Field (obj_copy, i) = (field_offset + sizeof (header_t)) << 2; + Field (obj_copy, i) = field_offset + sizeof (header_t); + + size_t fixup = (void *)&Field(obj_copy, i) - ptr->ptr; + area_append (fixups, &fixup, sizeof fixup); } } } @@ -201,6 +216,21 @@ do_restore (area *ptr, area *restore) } } +// Fixup fake pointers. +static void +do_fixups (area *ptr, area *fixups) +{ + long i; + + for (i = 0; i < fixups->n; i += sizeof (size_t)) + { + size_t fixup = *(size_t *)(fixups->ptr + i); + size_t offset = *(size_t *)(ptr->ptr + fixup); + void *real_ptr = ptr->ptr + offset; + *(value *)(ptr->ptr + fixup) = (value) real_ptr; + } +} + CAMLprim value ancient_mark (value obj) { @@ -211,10 +241,13 @@ ancient_mark (value obj) area_init (&ptr); area restore; // Headers to be fixed up after. area_init (&restore); + area fixups; // List of fake pointers to be fixed up. + area_init (&fixups); - if (mark (obj, &ptr, &restore) == -1) { + if (mark (obj, &ptr, &restore, &fixups) == -1) { // Ran out of memory. Recover and throw an exception. do_restore (&ptr, &restore); + area_free (&fixups); area_free (&restore); area_free (&ptr); caml_failwith ("out of memory"); @@ -227,33 +260,8 @@ ancient_mark (value obj) // Update all fake pointers in the out of heap area to make them real // pointers. - size_t i; - for (i = 0; i < ptr.n; ) - { - // Out of heap area is: header, fields, header, fields, ... - // The header of each object tells us how many fields it has. - char *header = ptr.ptr + i; - size_t bytes = Bhsize_hp (header); - value obj = Val_hp (header); - - int can_scan = Tag_val (obj) < No_scan_tag; - if (can_scan) { - mlsize_t nr_words = Wosize_hp (header); - mlsize_t j; - - for (j = 0; j < nr_words; ++j) { - value field = Field (obj, j); - - if (Is_block (field)) { - size_t field_offset = field >> 2; - void *field_ptr = ptr.ptr + field_offset; - Field (obj, j) = (value) field_ptr; - } - } - } - - i += bytes; // Skip to next object. - } + do_fixups (&ptr, &fixups); + area_free (&fixups); // Replace obj with a proxy. proxy = caml_alloc (1, Abstract_tag); @@ -285,7 +293,7 @@ ancient_delete (value obj) if (Is_long (v)) caml_invalid_argument ("deleted"); // Otherwise v is a pointer to the out of heap malloc'd object. - assert (!Is_in_heap (v)); + assert (!Is_young (v) && !Is_in_heap (v)); free ((void *) v); // Replace the proxy (a pointer) with an int 0 so we know it's diff --git a/test_ancient_weblogs.ml b/test_ancient_weblogs.ml new file mode 100644 index 0000000..946075b --- /dev/null +++ b/test_ancient_weblogs.ml @@ -0,0 +1,88 @@ +(* Load in large weblogs and see if they can still be used. + * $Id: test_ancient_weblogs.ml,v 1.1 2006-09-27 14:05:07 rich Exp $ + *) + +open Printf + +open ExtList + +let gc_stats = true (* If true, print GC stats before processing each day. *) + +let (//) = Filename.concat + +let rec range a b = + if a > b then [] + else a :: range (succ a) b + +(* Cartesian join of two lists. *) +let cartesian xs ys = + List.flatten ( + List.map ( + fun x -> + List.map ( + fun y -> x, y + ) ys + ) xs + ) + +let file_readable filename = + try Unix.access filename [Unix.R_OK]; true + with Unix.Unix_error _ -> false + +(* Suppress warning messages. *) +let () = Weblogs.quiet := true + +let gc_compact () = + eprintf "compacting ... %!"; + Gc.compact (); + if gc_stats then ( + let stat = Gc.stat () in + let live_words = stat.Gc.live_words in + eprintf "live words = %d (%d MB)\n%!" + live_words (live_words * 8 / 1024 / 1024) + ) + +(* Find the list of files. Some which should exist don't, so + * warnings about those so we can chase up. + *) +let files = + let dir = "/home/rich/oversized-logfiles/perrys" in + let drivers = + [ "burns"; "gronholm"; "rohrl"; "sainz"; "solberg"; "vatanen" ] in + let dates = range 1 31 in + let dates = List.map (fun day -> sprintf "200608%02d" day) dates in + let files = cartesian drivers dates in + let files = + List.map (fun (driver, date) -> + sprintf "%s-perrys-access.log.%s.gz" driver date) files in + let files = + List.filter_map ( + fun filename -> + let path = dir // filename in + if not (file_readable path) then ( + prerr_endline ("warning: " ^ filename ^ " not found - ignored"); + None + ) else ( + Some path + ) + ) files in + + eprintf "number of files = %d\n%!" (List.length files); + + files + +(* Load each file into memory and make it ancient. *) +let () = + let files = + List.map ( + fun filename -> + eprintf "Importing file %s\n%!" filename; + let rows = + let rows = Weblogs.import_file filename in + Ancient.mark rows in + gc_compact (); + rows + ) files in + + ignore (files) + -- 1.8.3.1