/ltmain.sh
/missing
/stamp-h1
+/virt-similarity
/virt-similarity.1
/virt-similarity.spec
/virt-similarity-*.tar.gz
EXTRA_DIST = \
COPYING \
README \
+ config.ml.in \
virt-similarity.spec \
virt-similarity.spec.in \
$(SOURCES)
CLEANFILES = *~ *.cmi *.cmo *.cmx *.cma *.cmxa virt-similarity
+OCAMLPACKAGES = -package unix,guestfs
+OCAMLCFLAGS = -g -warn-error CDEFLMPSUVYZX $(OCAMLPACKAGES)
+OCAMLOPTFLAGS = $(OCAMLCFLAGS)
+
# Sources in alphabetical order.
SOURCES = \
- similarity.ml
+ cache.mli \
+ cache.ml \
+ cladogram.mli \
+ cladogram.ml \
+ config.ml \
+ hash.ml \
+ similarity.ml \
+ utils.ml
# Objects in build order.
OBJECTS = \
+ config.cmo \
+ utils.cmo \
+ hash.cmo \
+ cache.cmo \
+ cladogram.cmo \
similarity.cmo
-XOBJECTS = $(OBJECTS:.=cmo=.cmx)
+XOBJECTS = $(OBJECTS:.cmo=.cmx)
+
+bin_SCRIPTS = virt-similarity
if HAVE_OCAMLOPT
virt-similarity: $(XOBJECTS)
- $(OCAMLFIND) opt $(OCAMLCFLAGS) \
+ $(OCAMLFIND) opt $(OCAMLOPTFLAGS) \
-linkpkg $(XOBJECTS) -o $@
else
virt-similarity: $(OBJECTS)
--- /dev/null
+virt-similarity: Find clusters of similar/cloned virtual machines
+Copyright (C) 2013 Red Hat Inc.
+======================================================================
+
+Compiling from source
+---------------------
+
+If checking out from git, then:
+ autoreconf -i
+
+Build it:
+ ./configure
+ make
+
+Optionally:
+ sudo make install
+
+Requirements
+------------
+
+- ocaml >= 3.12.0
+- ocaml findlib
+- libguestfs >= 1.14
+- ocaml libguestfs bindings
+
+Developers
+----------
+
+The upstream git repo is:
+
+http://git.annexia.org/?p=virt-similarity.git;a=summary
+
+Please send patches to the virt-tools mailing list:
+
+http://www.redhat.com/mailman/listinfo/virt-tools-list
+
+Notes on the technique used
+---------------------------
+
+(1) We use libguestfs to open each disk image. This allows us to get
+at the raw data, in case the disk image is stored in some format like
+qcow2 or vmdk. Also you could extend this program so it could
+understand encrypted disks.
+
+http://libguestfs.org/
+http://libguestfs.org/guestfs-java.3.html
+
+(2) For each disk, we split it into 64K blocks and hash each block.
+The reason for choosing 64K blocks is that it's the normal cluster
+size for qcow2, and the block size used by qemu-img etc. The reason
+for doing the hashing is so that we can compare the disk images for
+similarity by holding the complete set of hashes in memory. The
+hashing reduces each disk by a factor of 4096 (MD5) or 2048 (SHA-256)
+times, so for example a 10 GB disk image is reduced to a more
+manageable 2.5 or 5 MB.
+
+NB: For speed the hashes are saved in a cache file called
+'similarity-cache' in the local directory. You can just delete this
+file when done.
+
+(3) We then compare each disk image, block by block, and record the
+difference between each pair of images.
+
+Note that we DON'T do advanced Cluster Analysis on the disk images.
+There's not any point since the rebasing operation used by qemu-img
+can only handle simple differences at the block level; it cannot, for
+example, move blocks around or do fuzzy matches.
+http://en.wikipedia.org/wiki/Cluster_analysis
+
+(4) We then output a tree (technically a 'Cladogram') showing a
+hierarchy of guests, using a simple hierarchical clustering algorithm,
+where we group the two closest guests, then that group with the next
+closest guest, and so forth.
+
+http://en.wikipedia.org/wiki/Cladogram
+http://en.wikipedia.org/wiki/Hierarchical_clustering
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+open Unix
+open Filename
+
+type filename = string
+type hashes = string array
+type cache = (string * hashes) list
+
+let (//) = Filename.concat
+
+let version = "1"
+
+let cache_file =
+ try Some (Sys.getenv "HOME" // ".similarity-cache.v" ^ version)
+ with Not_found -> None
+
+let read_cache_file () : cache =
+ match cache_file with
+ | None -> []
+ | Some cache_file ->
+ try
+ let chan = open_in cache_file in
+ let v = input_value chan in
+ close_in chan;
+ v
+ with
+ _ -> []
+
+let write_cache_file (cache : cache) =
+ match cache_file with
+ | None -> ()
+ | Some cache_file ->
+ let tmp_name = cache_file ^ ".tmp" in
+ let chan = open_out tmp_name in
+ output_value chan cache;
+ close_out chan;
+ rename tmp_name cache_file
+
+let get_hash cache filename =
+ try Some (List.assoc filename cache) with Not_found -> None
+
+let update_hash cache filename hashes =
+ let cache = List.remove_assoc filename cache in
+ (filename, hashes) :: cache
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+(** Handle the cache file. *)
+
+type cache
+(** The immutable cache, which is a map of filename to {!hashes}. *)
+
+type hashes = string array
+(** An array of block hashes. One array for each disk image. *)
+
+type filename = string
+(** A filename. *)
+
+val read_cache_file : unit -> cache
+(** Read the cache file. *)
+
+val write_cache_file : cache -> unit
+(** Write the cache file. *)
+
+val get_hash : cache -> filename -> hashes option
+(** Get the hash for a particular filename from the cache. *)
+
+val update_hash : cache -> filename -> hashes -> cache
+(** Set or update the hashes corresponding to filename in the cache.
+ Note that since {!cache} is immutable, this returns a new cache
+ object. *)
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+open Printf
+
+type t =
+ | Leaf of int (* A single disk image (index of). *)
+ | Node of t list (* An interior node in the tree. *)
+
+let rec images_in_subtree = function
+ | Leaf i -> [i]
+ | Node xs -> List.concat (List.map images_in_subtree xs)
+
+let max_list = List.fold_left max min_int
+
+let mapi f xs =
+ let rec loop i = function
+ | [] -> []
+ | x :: xs -> let r = f i x in r :: loop (i+1) xs
+ in
+ loop 0 xs
+
+let format_cladogram ?format_leaf t =
+ let format_leaf = match format_leaf with
+ | None -> string_of_int
+ | Some f -> f
+ in
+ let rec format = function
+ | Leaf i ->
+ let s = "--- " ^ format_leaf i in
+ [s; ""], String.length s
+ | Node xs ->
+ let xs = List.map format xs in
+ let n = List.length xs in
+ let w = 7 + max_list (List.map snd xs) in
+ let xs = mapi (
+ fun row (ss, _) ->
+ let s, ss = match ss with
+ | s :: ss -> s, ss
+ | [] -> assert false in
+ if row = 0 then (
+ ("---+---" ^ s) ::
+ List.map (fun s -> " | " ^ s) ss
+ ) else if row < n-1 then (
+ (" +---" ^ s) ::
+ List.map (fun s -> " | " ^ s) ss
+ ) else (
+ (" +---" ^ s) ::
+ List.map (fun s -> " " ^ s) ss
+ )
+ ) xs in
+ List.concat xs, w
+ in
+ let strs, _ = format t in
+ strs
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+(** Cladograms. *)
+
+type t =
+ | Leaf of int (** A single disk image (index of). *)
+ | Node of t list (** An interior node in the tree. *)
+
+val images_in_subtree : t -> int list
+
+val format_cladogram : ?format_leaf:(int -> string) -> t -> string list
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+let package_name = "virt-similarity"
+let package_version = "1.0"
+
+let ocamlopt = "ocamlopt.opt" <> "no"
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+let package_name = "@PACKAGE_NAME@"
+let package_version = "@PACKAGE_VERSION@"
+
+let ocamlopt = "@OCAMLOPT@" <> "no"
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([Makefile
+ config.ml
virt-similarity.spec])
AC_OUTPUT
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+module MD5 = struct
+ let algorithm_name = "MD-5"
+ let hash_bits_per_block = 128
+ let hash_bytes_per_block = hash_bits_per_block / 8
+ let digest = Digest.string
+end
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*)
+open Printf
+
+open Cladogram
+open Utils
+
+let blocksize = 65536
+module HashAlgorithm = Hash.MD5
+let hash_bytes_per_block = HashAlgorithm.hash_bytes_per_block
+
+(* Int64 operators. *)
+let (+^) = Int64.add
+let (-^) = Int64.sub
+let ( *^ ) = Int64.mul
+let (/^) = Int64.div
+
+(* Read command line parameters. *)
+let n, filenames =
+ let display_version () =
+ printf "%s %s (%s)\n"
+ Config.package_name Config.package_version
+ (if Config.ocamlopt then "native" else "bytecode");
+ exit 0
+ in
+
+ let argspec = Arg.align [
+ "--version", Arg.Unit display_version, " Display version number and exit";
+ "-V", Arg.Unit display_version, " Display version number and exit";
+ ] in
+
+ let filenames = ref [] in
+ let collect_filenames str = filenames := str :: !filenames in
+
+ let usage_msg = "
+virt-similarity: Find clusters of similar/cloned virtual machines
+Copyright (C) 2013 Red Hat Inc.
+
+For full documentation see the virt-similarity(1) man page.
+
+Usage:
+
+ virt-similarity [options] disk.img disk.img [disk.img ...]
+
+You must supply at least one disk image. You can supply disk
+images in most common formats (raw, qcow2, vmdk, etc.)
+
+Options:
+" in
+
+ (* Do the argument parsing. *)
+ Arg.parse argspec collect_filenames usage_msg;
+
+ (* Check the arguments. *)
+ let filenames = Array.of_list (List.rev !filenames) in
+ let n = Array.length filenames in
+ if n < 2 then (
+ eprintf "virt-similarity: At least two disk images must be specified.\n";
+ exit 1
+ );
+
+ n, filenames
+
+(* Read in the cache file. *)
+let cache = Cache.read_cache_file ()
+
+(* Read the disk images, hash them, and update the cache. *)
+let read_disk_image filename =
+ let g = new Guestfs.guestfs () in
+ g#add_drive_ro filename;
+ g#launch ();
+
+ let dev = "/dev/sda" in
+ let size = g#blockdev_getsize64 dev in
+ let rec loop offset last_percent accum =
+ if offset <= size -^ Int64.of_int blocksize then (
+ let percent = 100L *^ offset /^ size in
+ if percent <> last_percent then printf "%Ld%%\r%!" percent;
+
+ let block = g#pread_device dev blocksize offset in
+ let hash = HashAlgorithm.digest block in
+ loop (offset +^ Int64.of_int blocksize) percent (hash :: accum)
+ )
+ else accum
+ in
+ let hashes_reversed = loop 0L (-1L) [] in
+ g#close ();
+
+ Array.of_list (List.rev hashes_reversed)
+
+let cache =
+ List.fold_left (
+ fun cache filename ->
+ let cache, hashes =
+ match Cache.get_hash cache filename with
+ | Some hashes ->
+ printf "%s: disk image is already in the cache\n" filename;
+ cache, hashes
+ | None ->
+ printf "%s: reading disk image ...\n%!" filename;
+ let hashes = read_disk_image filename in
+ Cache.update_hash cache filename hashes, hashes in
+ printf "%s: number of blocks = %d\n" filename (Array.length hashes);
+ cache
+ ) cache (Array.to_list filenames)
+
+(* Write the updated cache file. *)
+let () = Cache.write_cache_file cache
+
+(* Work out the similarity for each pair of guests and store it in a
+ * matrix, where matrix.(i).(j) is the distance between filenames.(i)
+ * and filenames.(j).
+ *)
+let hash_of_zero =
+ let zero_string = String.make blocksize (Char.chr 0) in
+ HashAlgorithm.digest zero_string
+
+let calculate_distance hash1 hash2 =
+ let hash1 = Array.to_list hash1 in
+ let hash2 = Array.to_list hash2 in
+ let rec loop = function
+ | [], [] -> 0
+ | (x :: xs), [] when x = hash_of_zero -> loop (xs, [])
+ | (x :: xs), [] -> 1 + loop (xs, [])
+ | [], (y :: ys) when y = hash_of_zero -> loop ([], ys)
+ | [], (y :: ys) -> 1 + loop ([], ys)
+ | (x :: xs), (y :: ys) when x = y -> loop (xs, ys)
+ | (x :: xs), (y :: ys) -> 1 + loop (xs, ys)
+ in
+ loop (hash1, hash2)
+
+let matrix =
+ let matrix = Array.make_matrix n n 0 in
+ List.iter (
+ fun (i, j) ->
+ let hi = Cache.get_hash cache filenames.(i) in
+ let hj = Cache.get_hash cache filenames.(j) in
+ match hi, hj with
+ | Some hi, Some hj ->
+ let d = calculate_distance hi hj in
+ printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d;
+ matrix.(i).(j) <- d;
+ matrix.(j).(i) <- d
+ | _ -> assert false
+ ) (pairs_of_ints n);
+ matrix
+
+(* Construct the tree (cladogram). *)
+let cladogram =
+ (* At the bottom level, every disk image is in its own leaf. *)
+ let leaves =
+ let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in
+ loop 0 in
+
+ (* Find the closest subtrees and combine them. *)
+ let rec combine_closest_subtrees trees =
+ let trees = Array.of_list trees in
+ let n = Array.length trees in
+
+ (* Find the minimum distance between any two subtrees. *)
+ let min_distance = ref max_int in
+ List.iter (
+ fun (i, j) ->
+ let d = min_distance_between_subtrees trees.(i) trees.(j) in
+ if d < !min_distance then min_distance := d
+ ) (pairs_of_ints n);
+
+ let min_distance = !min_distance in
+
+ (* For each subtree that differs from another by exactly the
+ * minimum distance, group them together into a single subtree.
+ *)
+ let in_group = Array.make n false in
+ List.iter (
+ fun (i, j) ->
+ let d = min_distance_between_subtrees trees.(i) trees.(j) in
+ if d = min_distance then (
+ in_group.(i) <- true;
+ in_group.(j) <- true
+ )
+ ) (pairs_of_ints n);
+
+ let group = ref [] and rest = ref [] in
+ Array.iteri (
+ fun i in_group ->
+ if in_group then
+ group := trees.(i) :: !group
+ else
+ rest := trees.(i) :: !rest
+ ) in_group;
+
+ !rest @ [Node !group]
+
+ and min_distance_between_subtrees t1 t2 =
+ let min_distance = ref max_int in
+
+ let xs = images_in_subtree t1 in
+ let ys = images_in_subtree t2 in
+
+ List.iter (
+ fun (i, j) ->
+ let d = matrix.(i).(j) in
+ if d < !min_distance then min_distance := d
+ ) (pairs xs ys);
+
+ !min_distance
+ in
+
+ let rec loop trees =
+ match trees with
+ | [] -> assert false
+ | [x] -> x (* finished *)
+ | xs -> loop (combine_closest_subtrees xs)
+ in
+ loop leaves
+
+let () =
+ let format_leaf i = Filename.basename filenames.(i) in
+ let lines = format_cladogram ~format_leaf cladogram in
+ List.iter print_endline lines
--- /dev/null
+(* virt-similarity
+ * Copyright (C) 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+(* Returns all unique pairs of integers (i, j) = (0..n-1, 0..n-1) where i != j.
+ * For example:
+ * 'pairs_of_ints 3' returns [ (0,1); (0,2); (1,2) ]
+ *)
+let pairs_of_ints n =
+ let ret = ref [] in
+ for i = n-1 downto 0 do
+ for j = i-1 downto 0 do
+ ret := (j, i) :: !ret
+ done
+ done;
+ !ret
+
+(* Returns all pairs from lists 'xs' and 'ys'. *)
+let pairs xs ys =
+ let ret = ref [] in
+ List.iter (fun x -> List.iter (fun y -> ret := (x, y) :: !ret) ys) xs;
+ !ret