# Mark objects as 'ancient' so they are taken out of the OCaml heap.
-# $Id: Makefile,v 1.3 2006-09-27 12:10:18 rich Exp $
+# $Id: Makefile,v 1.4 2006-09-27 14:05:07 rich Exp $
include Makefile.config
OCAMLDOCFLAGS := -html -stars -sort $(OCAMLCPACKAGES)
-all: ancient.cma ancient.cmxa test_ancient.opt META
+ifeq ($(TEST_WEBLOGS),1)
+# For testing with large amount of weblogs data.
+OCAMLCPACKAGES := -package calendar,pcre,extlib -I ../../freeware/weblogs
+OCAMLCLIBS := -linkpkg weblogs.cma
+OCAMLOPTPACKAGES := $(OCAMLCPACKAGES)
+OCAMLOPTLIBS := -linkpkg weblogs.cmxa
+endif
+
+TARGETS := ancient.cma ancient.cmxa META test_ancient.opt
+
+ifeq ($(TEST_WEBLOGS),1)
+TARGETS += test_ancient_weblogs.opt
+endif
+
+all: $(TARGETS)
ancient.cma: ancient.cmo ancient_c.o
ocamlmklib -o ancient $^
test_ancient.opt: ancient.cmxa test_ancient.cmx
LIBRARY_PATH=.:$$LIBRARY_PATH \
- ocamlfind ocamlopt $(OCAMLOPTFLAGS) $(OCAMLOPTLIBS) -o $@ $^
+ ocamlfind ocamlopt $(OCAMLOPTFLAGS) $(OCAMLOPTPACKAGES) $(OCAMLOPTLIBS) -o $@ $^
+
+ifeq ($(TEST_WEBLOGS),1)
+test_ancient_weblogs.opt: ancient.cmxa test_ancient_weblogs.cmx
+ LIBRARY_PATH=.:$$LIBRARY_PATH \
+ ocamlfind ocamlopt $(OCAMLOPTFLAGS) $(OCAMLOPTPACKAGES) $(OCAMLOPTLIBS) -o $@ $^
+endif
# Common rules for building OCaml objects.
/* Mark objects as 'ancient' so they are taken out of the OCaml heap.
- * $Id: ancient_c.c,v 1.1 2006-09-27 12:07:07 rich Exp $
+ * $Id: ancient_c.c,v 1.2 2006-09-27 14:05:07 rich Exp $
*/
#include <string.h>
// From byterun/misc.h:
typedef char * addr;
+// From byterun/minor_gc.c:
+CAMLextern char *caml_young_start;
+CAMLextern char *caml_young_end;
+#define Is_young(val) \
+ (assert (Is_block (val)), \
+ (addr)(val) < (addr)caml_young_end && (addr)(val) > (addr)caml_young_start)
+
// From byterun/major_gc.h:
#ifdef __alpha
typedef int page_table_entry;
// object's header to a special 'visited' value. However since these
// are objects in the Caml heap we have to restore the original
// headers at the end, which is the purpose of the [restore] area.
+// 4. We use realloc to allocate the memory for the copy, but because
+// the memory can move around, we cannot store absolute pointers.
+// Instead we store offsets and fix them up later. This is the
+// purpose of the [fixups] area.
//
-// XXX Recursive function will probably fall over once we apply it to
-// large, deeply recursive structures. Should be replaced with something
-// iterative.
+// XXX Large, deeply recursive structures cause a stack overflow.
+// Temporary solution: 'ulimit -s unlimited'. This function should
+// be replaced with something iterative.
static size_t
-mark (value obj, area *ptr, area *restore)
+mark (value obj, area *ptr, area *restore, area *fixups)
{
char *header = Hp_val (obj);
assert (Wosize_hp (header) > 0); // Always true? (XXX)
// We can't handle out-of-heap objects.
// XXX Since someone might try to mark an ancient object, they
// might get this error, so we should try to do better here.
- assert (Is_in_heap (obj));
+ assert (Is_young (obj) || Is_in_heap (obj));
// If we've already visited this object, just return its offset
// in the out-of-heap memory.
for (i = 0; i < nr_words; ++i) {
value field = Field (obj, i);
- if (Is_block (field)) {
- size_t field_offset = mark (field, ptr, restore);
+ if (Is_block (field) &&
+ (Is_young (field) || Is_in_heap (field))) {
+ size_t field_offset = mark (field, ptr, restore, fixups);
if (field_offset == -1) return -1; // Propagate out of memory errors.
// Since the recursive call to mark above can reallocate the
// Don't store absolute pointers yet because realloc will
// move the memory around. Store a fake pointer instead.
// We'll fix up these fake pointers afterwards.
- Field (obj_copy, i) = (field_offset + sizeof (header_t)) << 2;
+ Field (obj_copy, i) = field_offset + sizeof (header_t);
+
+ size_t fixup = (void *)&Field(obj_copy, i) - ptr->ptr;
+ area_append (fixups, &fixup, sizeof fixup);
}
}
}
}
}
+// Fixup fake pointers.
+static void
+do_fixups (area *ptr, area *fixups)
+{
+ long i;
+
+ for (i = 0; i < fixups->n; i += sizeof (size_t))
+ {
+ size_t fixup = *(size_t *)(fixups->ptr + i);
+ size_t offset = *(size_t *)(ptr->ptr + fixup);
+ void *real_ptr = ptr->ptr + offset;
+ *(value *)(ptr->ptr + fixup) = (value) real_ptr;
+ }
+}
+
CAMLprim value
ancient_mark (value obj)
{
area_init (&ptr);
area restore; // Headers to be fixed up after.
area_init (&restore);
+ area fixups; // List of fake pointers to be fixed up.
+ area_init (&fixups);
- if (mark (obj, &ptr, &restore) == -1) {
+ if (mark (obj, &ptr, &restore, &fixups) == -1) {
// Ran out of memory. Recover and throw an exception.
do_restore (&ptr, &restore);
+ area_free (&fixups);
area_free (&restore);
area_free (&ptr);
caml_failwith ("out of memory");
// Update all fake pointers in the out of heap area to make them real
// pointers.
- size_t i;
- for (i = 0; i < ptr.n; )
- {
- // Out of heap area is: header, fields, header, fields, ...
- // The header of each object tells us how many fields it has.
- char *header = ptr.ptr + i;
- size_t bytes = Bhsize_hp (header);
- value obj = Val_hp (header);
-
- int can_scan = Tag_val (obj) < No_scan_tag;
- if (can_scan) {
- mlsize_t nr_words = Wosize_hp (header);
- mlsize_t j;
-
- for (j = 0; j < nr_words; ++j) {
- value field = Field (obj, j);
-
- if (Is_block (field)) {
- size_t field_offset = field >> 2;
- void *field_ptr = ptr.ptr + field_offset;
- Field (obj, j) = (value) field_ptr;
- }
- }
- }
-
- i += bytes; // Skip to next object.
- }
+ do_fixups (&ptr, &fixups);
+ area_free (&fixups);
// Replace obj with a proxy.
proxy = caml_alloc (1, Abstract_tag);
if (Is_long (v)) caml_invalid_argument ("deleted");
// Otherwise v is a pointer to the out of heap malloc'd object.
- assert (!Is_in_heap (v));
+ assert (!Is_young (v) && !Is_in_heap (v));
free ((void *) v);
// Replace the proxy (a pointer) with an int 0 so we know it's
--- /dev/null
+(* Load in large weblogs and see if they can still be used.
+ * $Id: test_ancient_weblogs.ml,v 1.1 2006-09-27 14:05:07 rich Exp $
+ *)
+
+open Printf
+
+open ExtList
+
+let gc_stats = true (* If true, print GC stats before processing each day. *)
+
+let (//) = Filename.concat
+
+let rec range a b =
+ if a > b then []
+ else a :: range (succ a) b
+
+(* Cartesian join of two lists. *)
+let cartesian xs ys =
+ List.flatten (
+ List.map (
+ fun x ->
+ List.map (
+ fun y -> x, y
+ ) ys
+ ) xs
+ )
+
+let file_readable filename =
+ try Unix.access filename [Unix.R_OK]; true
+ with Unix.Unix_error _ -> false
+
+(* Suppress warning messages. *)
+let () = Weblogs.quiet := true
+
+let gc_compact () =
+ eprintf "compacting ... %!";
+ Gc.compact ();
+ if gc_stats then (
+ let stat = Gc.stat () in
+ let live_words = stat.Gc.live_words in
+ eprintf "live words = %d (%d MB)\n%!"
+ live_words (live_words * 8 / 1024 / 1024)
+ )
+
+(* Find the list of files. Some which should exist don't, so
+ * warnings about those so we can chase up.
+ *)
+let files =
+ let dir = "/home/rich/oversized-logfiles/perrys" in
+ let drivers =
+ [ "burns"; "gronholm"; "rohrl"; "sainz"; "solberg"; "vatanen" ] in
+ let dates = range 1 31 in
+ let dates = List.map (fun day -> sprintf "200608%02d" day) dates in
+ let files = cartesian drivers dates in
+ let files =
+ List.map (fun (driver, date) ->
+ sprintf "%s-perrys-access.log.%s.gz" driver date) files in
+ let files =
+ List.filter_map (
+ fun filename ->
+ let path = dir // filename in
+ if not (file_readable path) then (
+ prerr_endline ("warning: " ^ filename ^ " not found - ignored");
+ None
+ ) else (
+ Some path
+ )
+ ) files in
+
+ eprintf "number of files = %d\n%!" (List.length files);
+
+ files
+
+(* Load each file into memory and make it ancient. *)
+let () =
+ let files =
+ List.map (
+ fun filename ->
+ eprintf "Importing file %s\n%!" filename;
+ let rows =
+ let rows = Weblogs.import_file filename in
+ Ancient.mark rows in
+ gc_compact ();
+ rows
+ ) files in
+
+ ignore (files)
+