From e4f09c77f0cc9335e20790a150a3539889f331de Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Thu, 31 Jan 2013 19:46:15 +0000 Subject: [PATCH] Further work on similarity. --- Makefile.am | 2 ++ cladogram.ml | 81 +++++++++++++++++++++++++++++++++++++++++++-- cladogram.mli | 4 ++- similarity.ml | 87 +++++++++---------------------------------------- virt-similarity.pod | 46 ++++++++++++++++++++++---- virt-similarity.spec.in | 2 +- 6 files changed, 139 insertions(+), 83 deletions(-) diff --git a/Makefile.am b/Makefile.am index 6429024..ad3707c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,8 @@ EXTRA_DIST = \ COPYING \ README \ config.ml.in \ + .depend \ + virt-similarity.pod \ virt-similarity.spec \ virt-similarity.spec.in \ $(SOURCES) diff --git a/cladogram.ml b/cladogram.ml index b02aed5..267d7f0 100644 --- a/cladogram.ml +++ b/cladogram.ml @@ -18,13 +18,15 @@ open Printf +open Utils + type t = | Leaf of int (* A single disk image (index of). *) - | Node of t list (* An interior node in the tree. *) + | Node of t list * int (* An interior node in the tree. *) let rec images_in_subtree = function | Leaf i -> [i] - | Node xs -> List.concat (List.map images_in_subtree xs) + | Node (xs, _) -> List.concat (List.map images_in_subtree xs) let max_list = List.fold_left max min_int @@ -35,6 +37,79 @@ let mapi f xs = in loop 0 xs +(* Compute the minimum distance between subtrees t1 and t2. 'matrix' + * is the distance matrix between leaves. + *) +let min_distance_between_subtrees matrix t1 t2 = + let min_distance = ref max_int in + + let xs = images_in_subtree t1 in + let ys = images_in_subtree t2 in + + List.iter ( + fun (i, j) -> + let d = matrix.(i).(j) in + if d < !min_distance then min_distance := d + ) (pairs xs ys); + + !min_distance + +(* Find the closest subtrees and combine them. *) +let combine_closest_subtrees matrix trees = + let trees = Array.of_list trees in + let n = Array.length trees in + + (* Find the minimum distance between any two subtrees. *) + let min_distance = ref max_int in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees matrix trees.(i) trees.(j) in + if d < !min_distance then min_distance := d + ) (pairs_of_ints n); + + let min_distance = !min_distance in + + (* For each subtree that differs from another by exactly the + * minimum distance, group them together into a single subtree. + *) + let in_group = Array.make n false in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees matrix trees.(i) trees.(j) in + if d = min_distance then ( + in_group.(i) <- true; + in_group.(j) <- true + ) + ) (pairs_of_ints n); + + let group = ref [] and rest = ref [] in + Array.iteri ( + fun i in_group -> + if in_group then + group := trees.(i) :: !group + else + rest := trees.(i) :: !rest + ) in_group; + + !rest @ [Node (!group, min_distance)] + +let construct_cladogram matrix n = + (* At the bottom level, every disk image is in its own leaf. *) + let leaves = + let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in + loop 0 in + + (* Work up the tree, combining subtrees together, until we end + * up with one tree (ie. the top node of the final tree). + *) + let rec loop trees = + match trees with + | [] -> assert false + | [x] -> x (* finished *) + | xs -> loop (combine_closest_subtrees matrix xs) + in + loop leaves + let format_cladogram ?format_leaf t = let format_leaf = match format_leaf with | None -> string_of_int @@ -44,7 +119,7 @@ let format_cladogram ?format_leaf t = | Leaf i -> let s = "--- " ^ format_leaf i in [s; ""], String.length s - | Node xs -> + | Node (xs, _) -> let xs = List.map format xs in let n = List.length xs in let w = 7 + max_list (List.map snd xs) in diff --git a/cladogram.mli b/cladogram.mli index 95d9c4e..cfd0b02 100644 --- a/cladogram.mli +++ b/cladogram.mli @@ -20,8 +20,10 @@ type t = | Leaf of int (** A single disk image (index of). *) - | Node of t list (** An interior node in the tree. *) + | Node of t list * int (** An interior node in the tree. *) val images_in_subtree : t -> int list +val construct_cladogram : int array array -> int -> t + val format_cladogram : ?format_leaf:(int -> string) -> t -> string list diff --git a/similarity.ml b/similarity.ml index 0dd52a4..de6db3e 100644 --- a/similarity.ml +++ b/similarity.ml @@ -32,7 +32,7 @@ let ( *^ ) = Int64.mul let (/^) = Int64.div (* Read command line parameters. *) -let n, filenames = +let n, filenames, debug = let display_version () = printf "%s %s (%s)\n" Config.package_name Config.package_version @@ -40,7 +40,11 @@ let n, filenames = exit 0 in + let debug = ref false in + let argspec = Arg.align [ + "--debug", Arg.Set debug, " Enable debugging"; + "-d", Arg.Set debug, " Enable debugging"; "--version", Arg.Unit display_version, " Display version number and exit"; "-V", Arg.Unit display_version, " Display version number and exit"; ] in @@ -75,7 +79,9 @@ Options: exit 1 ); - n, filenames + let debug = !debug in + + n, filenames, debug (* Read in the cache file. *) let cache = Cache.read_cache_file () @@ -110,13 +116,15 @@ let cache = let cache, hashes = match Cache.get_hash cache filename with | Some hashes -> - printf "%s: disk image is already in the cache\n" filename; + if debug then + printf "%s: disk image is already in the cache\n%!" filename; cache, hashes | None -> printf "%s: reading disk image ...\n%!" filename; let hashes = read_disk_image filename in Cache.update_hash cache filename hashes, hashes in - printf "%s: number of blocks = %d\n" filename (Array.length hashes); + if debug then + printf "%s: number of blocks = %d\n" filename (Array.length hashes); cache ) cache (Array.to_list filenames) @@ -154,7 +162,8 @@ let matrix = match hi, hj with | Some hi, Some hj -> let d = calculate_distance hi hj in - printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d; + if debug then + printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d; matrix.(i).(j) <- d; matrix.(j).(i) <- d | _ -> assert false @@ -162,73 +171,7 @@ let matrix = matrix (* Construct the tree (cladogram). *) -let cladogram = - (* At the bottom level, every disk image is in its own leaf. *) - let leaves = - let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in - loop 0 in - - (* Find the closest subtrees and combine them. *) - let rec combine_closest_subtrees trees = - let trees = Array.of_list trees in - let n = Array.length trees in - - (* Find the minimum distance between any two subtrees. *) - let min_distance = ref max_int in - List.iter ( - fun (i, j) -> - let d = min_distance_between_subtrees trees.(i) trees.(j) in - if d < !min_distance then min_distance := d - ) (pairs_of_ints n); - - let min_distance = !min_distance in - - (* For each subtree that differs from another by exactly the - * minimum distance, group them together into a single subtree. - *) - let in_group = Array.make n false in - List.iter ( - fun (i, j) -> - let d = min_distance_between_subtrees trees.(i) trees.(j) in - if d = min_distance then ( - in_group.(i) <- true; - in_group.(j) <- true - ) - ) (pairs_of_ints n); - - let group = ref [] and rest = ref [] in - Array.iteri ( - fun i in_group -> - if in_group then - group := trees.(i) :: !group - else - rest := trees.(i) :: !rest - ) in_group; - - !rest @ [Node !group] - - and min_distance_between_subtrees t1 t2 = - let min_distance = ref max_int in - - let xs = images_in_subtree t1 in - let ys = images_in_subtree t2 in - - List.iter ( - fun (i, j) -> - let d = matrix.(i).(j) in - if d < !min_distance then min_distance := d - ) (pairs xs ys); - - !min_distance - in - - let rec loop trees = - match trees with - | [] -> assert false - | [x] -> x (* finished *) - | xs -> loop (combine_closest_subtrees xs) - in - loop leaves +let cladogram = construct_cladogram matrix n let () = let format_leaf i = Filename.basename filenames.(i) in diff --git a/virt-similarity.pod b/virt-similarity.pod index a6fa3c4..d853c28 100644 --- a/virt-similarity.pod +++ b/virt-similarity.pod @@ -6,31 +6,65 @@ virt-similarity - Find clusters of similar/cloned virtual machines =head1 SYNOPSIS - + virt-similarity disk1 disk2 [disk3 ...] =head1 DESCRIPTION -Virt-similiarity is a tool for doing cluster analysis of groups of +Virt-similarity is a tool for doing cluster analysis of groups of virtual machines. It can automatically detect machines which have been cloned from each other. It can produce a "cladogram" showing the "family history" of each guest, or you can use it to create the most efficient tree of backing files which will use the least disk space. +Basic usage is simply to run virt-similarity with a list of disk +images. It will read and analyze the disk images, then print a +cladogram showing how they are related: + + $ virt-similarity winxp-copy.img winxp-copy2.img \ + f19rawhidex32.img f19rawhidex64.img archlinux20121201x64.img \ + winxp.img winxp-copy2b.img + + ---+------+------+------+------+------ winxp-copy2.img + | | | | | + | | | | +------ winxp-copy2b.img + | | | | + | | | +------+------ winxp.img + | | | | + | | | +------ winxp-copy.img + | | | + | | +------ f19rawhidex64.img + | | + | +------ f19rawhidex32.img + | + +------ archlinux20121201x64.img + +The first time you run virt-similarity, it has to read the disk images +which takes some time. On subsequent runs, the information from each +disk image is cached in a special form in a hidden file in your home +directory called C<~/.similarity-cache.v1>. You can simply delete +this cache file at any time if you want, especially if you want to +force virt-similarity to re-read the disk images. +=head1 FILES +=over 4 +=item C<~/.similarity-cache.v1> -=head1 FILES - +The cache file where virt-similarity saves information about each disk +image so that it doesn't have to re-read them each time it is run. +You can delete this file at any time. +=back =head1 ENVIRONMENT VARIABLES - +Virt-similarity uses libguestfs, and is affected by various libguestfs +environment variables. See L. =head1 SEE ALSO - +L, L =head1 AUTHOR diff --git a/virt-similarity.spec.in b/virt-similarity.spec.in index 38d190a..b0430e5 100644 --- a/virt-similarity.spec.in +++ b/virt-similarity.spec.in @@ -20,7 +20,7 @@ BuildRequires: /usr/bin/perldoc %description -Virt-similiarity is a tool for doing cluster analysis of groups of +Virt-similarity is a tool for doing cluster analysis of groups of virtual machines. It can automatically detect machines which have been cloned from each other. It can produce a "cladogram" showing the "family history" of each guest, or you can use it to create the most -- 1.8.3.1