X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;ds=sidebyside;f=cladogram.ml;fp=cladogram.ml;h=267d7f0565f1350be82ca97c54080eb5628c0df1;hb=e4f09c77f0cc9335e20790a150a3539889f331de;hp=b02aed5b5042112739d0feb60d1676cf0eec3602;hpb=5845b6fe45d4dbe13c1bb2565231e2ab897b182d;p=virt-similarity.git diff --git a/cladogram.ml b/cladogram.ml index b02aed5..267d7f0 100644 --- a/cladogram.ml +++ b/cladogram.ml @@ -18,13 +18,15 @@ open Printf +open Utils + type t = | Leaf of int (* A single disk image (index of). *) - | Node of t list (* An interior node in the tree. *) + | Node of t list * int (* An interior node in the tree. *) let rec images_in_subtree = function | Leaf i -> [i] - | Node xs -> List.concat (List.map images_in_subtree xs) + | Node (xs, _) -> List.concat (List.map images_in_subtree xs) let max_list = List.fold_left max min_int @@ -35,6 +37,79 @@ let mapi f xs = in loop 0 xs +(* Compute the minimum distance between subtrees t1 and t2. 'matrix' + * is the distance matrix between leaves. + *) +let min_distance_between_subtrees matrix t1 t2 = + let min_distance = ref max_int in + + let xs = images_in_subtree t1 in + let ys = images_in_subtree t2 in + + List.iter ( + fun (i, j) -> + let d = matrix.(i).(j) in + if d < !min_distance then min_distance := d + ) (pairs xs ys); + + !min_distance + +(* Find the closest subtrees and combine them. *) +let combine_closest_subtrees matrix trees = + let trees = Array.of_list trees in + let n = Array.length trees in + + (* Find the minimum distance between any two subtrees. *) + let min_distance = ref max_int in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees matrix trees.(i) trees.(j) in + if d < !min_distance then min_distance := d + ) (pairs_of_ints n); + + let min_distance = !min_distance in + + (* For each subtree that differs from another by exactly the + * minimum distance, group them together into a single subtree. + *) + let in_group = Array.make n false in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees matrix trees.(i) trees.(j) in + if d = min_distance then ( + in_group.(i) <- true; + in_group.(j) <- true + ) + ) (pairs_of_ints n); + + let group = ref [] and rest = ref [] in + Array.iteri ( + fun i in_group -> + if in_group then + group := trees.(i) :: !group + else + rest := trees.(i) :: !rest + ) in_group; + + !rest @ [Node (!group, min_distance)] + +let construct_cladogram matrix n = + (* At the bottom level, every disk image is in its own leaf. *) + let leaves = + let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in + loop 0 in + + (* Work up the tree, combining subtrees together, until we end + * up with one tree (ie. the top node of the final tree). + *) + let rec loop trees = + match trees with + | [] -> assert false + | [x] -> x (* finished *) + | xs -> loop (combine_closest_subtrees matrix xs) + in + loop leaves + let format_cladogram ?format_leaf t = let format_leaf = match format_leaf with | None -> string_of_int @@ -44,7 +119,7 @@ let format_cladogram ?format_leaf t = | Leaf i -> let s = "--- " ^ format_leaf i in [s; ""], String.length s - | Node xs -> + | Node (xs, _) -> let xs = List.map format xs in let n = List.length xs in let w = 7 + max_list (List.map snd xs) in