COPYING \
README \
config.ml.in \
+ .depend \
+ virt-similarity.pod \
virt-similarity.spec \
virt-similarity.spec.in \
$(SOURCES)
open Printf
+open Utils
+
type t =
| Leaf of int (* A single disk image (index of). *)
- | Node of t list (* An interior node in the tree. *)
+ | Node of t list * int (* An interior node in the tree. *)
let rec images_in_subtree = function
| Leaf i -> [i]
- | Node xs -> List.concat (List.map images_in_subtree xs)
+ | Node (xs, _) -> List.concat (List.map images_in_subtree xs)
let max_list = List.fold_left max min_int
in
loop 0 xs
+(* Compute the minimum distance between subtrees t1 and t2. 'matrix'
+ * is the distance matrix between leaves.
+ *)
+let min_distance_between_subtrees matrix t1 t2 =
+ let min_distance = ref max_int in
+
+ let xs = images_in_subtree t1 in
+ let ys = images_in_subtree t2 in
+
+ List.iter (
+ fun (i, j) ->
+ let d = matrix.(i).(j) in
+ if d < !min_distance then min_distance := d
+ ) (pairs xs ys);
+
+ !min_distance
+
+(* Find the closest subtrees and combine them. *)
+let combine_closest_subtrees matrix trees =
+ let trees = Array.of_list trees in
+ let n = Array.length trees in
+
+ (* Find the minimum distance between any two subtrees. *)
+ let min_distance = ref max_int in
+ List.iter (
+ fun (i, j) ->
+ let d = min_distance_between_subtrees matrix trees.(i) trees.(j) in
+ if d < !min_distance then min_distance := d
+ ) (pairs_of_ints n);
+
+ let min_distance = !min_distance in
+
+ (* For each subtree that differs from another by exactly the
+ * minimum distance, group them together into a single subtree.
+ *)
+ let in_group = Array.make n false in
+ List.iter (
+ fun (i, j) ->
+ let d = min_distance_between_subtrees matrix trees.(i) trees.(j) in
+ if d = min_distance then (
+ in_group.(i) <- true;
+ in_group.(j) <- true
+ )
+ ) (pairs_of_ints n);
+
+ let group = ref [] and rest = ref [] in
+ Array.iteri (
+ fun i in_group ->
+ if in_group then
+ group := trees.(i) :: !group
+ else
+ rest := trees.(i) :: !rest
+ ) in_group;
+
+ !rest @ [Node (!group, min_distance)]
+
+let construct_cladogram matrix n =
+ (* At the bottom level, every disk image is in its own leaf. *)
+ let leaves =
+ let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in
+ loop 0 in
+
+ (* Work up the tree, combining subtrees together, until we end
+ * up with one tree (ie. the top node of the final tree).
+ *)
+ let rec loop trees =
+ match trees with
+ | [] -> assert false
+ | [x] -> x (* finished *)
+ | xs -> loop (combine_closest_subtrees matrix xs)
+ in
+ loop leaves
+
let format_cladogram ?format_leaf t =
let format_leaf = match format_leaf with
| None -> string_of_int
| Leaf i ->
let s = "--- " ^ format_leaf i in
[s; ""], String.length s
- | Node xs ->
+ | Node (xs, _) ->
let xs = List.map format xs in
let n = List.length xs in
let w = 7 + max_list (List.map snd xs) in
type t =
| Leaf of int (** A single disk image (index of). *)
- | Node of t list (** An interior node in the tree. *)
+ | Node of t list * int (** An interior node in the tree. *)
val images_in_subtree : t -> int list
+val construct_cladogram : int array array -> int -> t
+
val format_cladogram : ?format_leaf:(int -> string) -> t -> string list
let (/^) = Int64.div
(* Read command line parameters. *)
-let n, filenames =
+let n, filenames, debug =
let display_version () =
printf "%s %s (%s)\n"
Config.package_name Config.package_version
exit 0
in
+ let debug = ref false in
+
let argspec = Arg.align [
+ "--debug", Arg.Set debug, " Enable debugging";
+ "-d", Arg.Set debug, " Enable debugging";
"--version", Arg.Unit display_version, " Display version number and exit";
"-V", Arg.Unit display_version, " Display version number and exit";
] in
exit 1
);
- n, filenames
+ let debug = !debug in
+
+ n, filenames, debug
(* Read in the cache file. *)
let cache = Cache.read_cache_file ()
let cache, hashes =
match Cache.get_hash cache filename with
| Some hashes ->
- printf "%s: disk image is already in the cache\n" filename;
+ if debug then
+ printf "%s: disk image is already in the cache\n%!" filename;
cache, hashes
| None ->
printf "%s: reading disk image ...\n%!" filename;
let hashes = read_disk_image filename in
Cache.update_hash cache filename hashes, hashes in
- printf "%s: number of blocks = %d\n" filename (Array.length hashes);
+ if debug then
+ printf "%s: number of blocks = %d\n" filename (Array.length hashes);
cache
) cache (Array.to_list filenames)
match hi, hj with
| Some hi, Some hj ->
let d = calculate_distance hi hj in
- printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d;
+ if debug then
+ printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d;
matrix.(i).(j) <- d;
matrix.(j).(i) <- d
| _ -> assert false
matrix
(* Construct the tree (cladogram). *)
-let cladogram =
- (* At the bottom level, every disk image is in its own leaf. *)
- let leaves =
- let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in
- loop 0 in
-
- (* Find the closest subtrees and combine them. *)
- let rec combine_closest_subtrees trees =
- let trees = Array.of_list trees in
- let n = Array.length trees in
-
- (* Find the minimum distance between any two subtrees. *)
- let min_distance = ref max_int in
- List.iter (
- fun (i, j) ->
- let d = min_distance_between_subtrees trees.(i) trees.(j) in
- if d < !min_distance then min_distance := d
- ) (pairs_of_ints n);
-
- let min_distance = !min_distance in
-
- (* For each subtree that differs from another by exactly the
- * minimum distance, group them together into a single subtree.
- *)
- let in_group = Array.make n false in
- List.iter (
- fun (i, j) ->
- let d = min_distance_between_subtrees trees.(i) trees.(j) in
- if d = min_distance then (
- in_group.(i) <- true;
- in_group.(j) <- true
- )
- ) (pairs_of_ints n);
-
- let group = ref [] and rest = ref [] in
- Array.iteri (
- fun i in_group ->
- if in_group then
- group := trees.(i) :: !group
- else
- rest := trees.(i) :: !rest
- ) in_group;
-
- !rest @ [Node !group]
-
- and min_distance_between_subtrees t1 t2 =
- let min_distance = ref max_int in
-
- let xs = images_in_subtree t1 in
- let ys = images_in_subtree t2 in
-
- List.iter (
- fun (i, j) ->
- let d = matrix.(i).(j) in
- if d < !min_distance then min_distance := d
- ) (pairs xs ys);
-
- !min_distance
- in
-
- let rec loop trees =
- match trees with
- | [] -> assert false
- | [x] -> x (* finished *)
- | xs -> loop (combine_closest_subtrees xs)
- in
- loop leaves
+let cladogram = construct_cladogram matrix n
let () =
let format_leaf i = Filename.basename filenames.(i) in
=head1 SYNOPSIS
-
+ virt-similarity disk1 disk2 [disk3 ...]
=head1 DESCRIPTION
-Virt-similiarity is a tool for doing cluster analysis of groups of
+Virt-similarity is a tool for doing cluster analysis of groups of
virtual machines. It can automatically detect machines which have
been cloned from each other. It can produce a "cladogram" showing the
"family history" of each guest, or you can use it to create the most
efficient tree of backing files which will use the least disk space.
+Basic usage is simply to run virt-similarity with a list of disk
+images. It will read and analyze the disk images, then print a
+cladogram showing how they are related:
+
+ $ virt-similarity winxp-copy.img winxp-copy2.img \
+ f19rawhidex32.img f19rawhidex64.img archlinux20121201x64.img \
+ winxp.img winxp-copy2b.img
+
+ ---+------+------+------+------+------ winxp-copy2.img
+ | | | | |
+ | | | | +------ winxp-copy2b.img
+ | | | |
+ | | | +------+------ winxp.img
+ | | | |
+ | | | +------ winxp-copy.img
+ | | |
+ | | +------ f19rawhidex64.img
+ | |
+ | +------ f19rawhidex32.img
+ |
+ +------ archlinux20121201x64.img
+
+The first time you run virt-similarity, it has to read the disk images
+which takes some time. On subsequent runs, the information from each
+disk image is cached in a special form in a hidden file in your home
+directory called C<~/.similarity-cache.v1>. You can simply delete
+this cache file at any time if you want, especially if you want to
+force virt-similarity to re-read the disk images.
+=head1 FILES
+=over 4
+=item C<~/.similarity-cache.v1>
-=head1 FILES
-
+The cache file where virt-similarity saves information about each disk
+image so that it doesn't have to re-read them each time it is run.
+You can delete this file at any time.
+=back
=head1 ENVIRONMENT VARIABLES
-
+Virt-similarity uses libguestfs, and is affected by various libguestfs
+environment variables. See L<guestfs(3)/ENVIRONMENT VARIABLES>.
=head1 SEE ALSO
-
+L<guestfs(3)>, L<http://virt-tools.org/>
=head1 AUTHOR
%description
-Virt-similiarity is a tool for doing cluster analysis of groups of
+Virt-similarity is a tool for doing cluster analysis of groups of
virtual machines. It can automatically detect machines which have
been cloned from each other. It can produce a "cladogram" showing the
"family history" of each guest, or you can use it to create the most