From 5845b6fe45d4dbe13c1bb2565231e2ab897b182d Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Thu, 31 Jan 2013 15:07:20 +0000 Subject: [PATCH] Implement similarity code. --- .gitignore | 1 + Makefile.am | 25 ++++++- README | 76 ++++++++++++++++++++ cache.ml | 61 ++++++++++++++++ cache.mli | 42 +++++++++++ cladogram.ml | 70 +++++++++++++++++++ cladogram.mli | 27 ++++++++ config.ml | 22 ++++++ config.ml.in | 22 ++++++ configure.ac | 1 + hash.ml | 24 +++++++ similarity.ml | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ utils.ml | 36 ++++++++++ 13 files changed, 622 insertions(+), 3 deletions(-) create mode 100644 README create mode 100644 cache.ml create mode 100644 cache.mli create mode 100644 cladogram.ml create mode 100644 cladogram.mli create mode 100644 config.ml create mode 100644 config.ml.in create mode 100644 hash.ml create mode 100644 utils.ml diff --git a/.gitignore b/.gitignore index 97a41c3..c7bfa4a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ Makefile /ltmain.sh /missing /stamp-h1 +/virt-similarity /virt-similarity.1 /virt-similarity.spec /virt-similarity-*.tar.gz diff --git a/Makefile.am b/Makefile.am index d1e27df..6429024 100644 --- a/Makefile.am +++ b/Makefile.am @@ -20,25 +20,44 @@ ACLOCAL_AMFLAGS = -I m4 EXTRA_DIST = \ COPYING \ README \ + config.ml.in \ virt-similarity.spec \ virt-similarity.spec.in \ $(SOURCES) CLEANFILES = *~ *.cmi *.cmo *.cmx *.cma *.cmxa virt-similarity +OCAMLPACKAGES = -package unix,guestfs +OCAMLCFLAGS = -g -warn-error CDEFLMPSUVYZX $(OCAMLPACKAGES) +OCAMLOPTFLAGS = $(OCAMLCFLAGS) + # Sources in alphabetical order. SOURCES = \ - similarity.ml + cache.mli \ + cache.ml \ + cladogram.mli \ + cladogram.ml \ + config.ml \ + hash.ml \ + similarity.ml \ + utils.ml # Objects in build order. OBJECTS = \ + config.cmo \ + utils.cmo \ + hash.cmo \ + cache.cmo \ + cladogram.cmo \ similarity.cmo -XOBJECTS = $(OBJECTS:.=cmo=.cmx) +XOBJECTS = $(OBJECTS:.cmo=.cmx) + +bin_SCRIPTS = virt-similarity if HAVE_OCAMLOPT virt-similarity: $(XOBJECTS) - $(OCAMLFIND) opt $(OCAMLCFLAGS) \ + $(OCAMLFIND) opt $(OCAMLOPTFLAGS) \ -linkpkg $(XOBJECTS) -o $@ else virt-similarity: $(OBJECTS) diff --git a/README b/README new file mode 100644 index 0000000..fb0620d --- /dev/null +++ b/README @@ -0,0 +1,76 @@ +virt-similarity: Find clusters of similar/cloned virtual machines +Copyright (C) 2013 Red Hat Inc. +====================================================================== + +Compiling from source +--------------------- + +If checking out from git, then: + autoreconf -i + +Build it: + ./configure + make + +Optionally: + sudo make install + +Requirements +------------ + +- ocaml >= 3.12.0 +- ocaml findlib +- libguestfs >= 1.14 +- ocaml libguestfs bindings + +Developers +---------- + +The upstream git repo is: + +http://git.annexia.org/?p=virt-similarity.git;a=summary + +Please send patches to the virt-tools mailing list: + +http://www.redhat.com/mailman/listinfo/virt-tools-list + +Notes on the technique used +--------------------------- + +(1) We use libguestfs to open each disk image. This allows us to get +at the raw data, in case the disk image is stored in some format like +qcow2 or vmdk. Also you could extend this program so it could +understand encrypted disks. + +http://libguestfs.org/ +http://libguestfs.org/guestfs-java.3.html + +(2) For each disk, we split it into 64K blocks and hash each block. +The reason for choosing 64K blocks is that it's the normal cluster +size for qcow2, and the block size used by qemu-img etc. The reason +for doing the hashing is so that we can compare the disk images for +similarity by holding the complete set of hashes in memory. The +hashing reduces each disk by a factor of 4096 (MD5) or 2048 (SHA-256) +times, so for example a 10 GB disk image is reduced to a more +manageable 2.5 or 5 MB. + +NB: For speed the hashes are saved in a cache file called +'similarity-cache' in the local directory. You can just delete this +file when done. + +(3) We then compare each disk image, block by block, and record the +difference between each pair of images. + +Note that we DON'T do advanced Cluster Analysis on the disk images. +There's not any point since the rebasing operation used by qemu-img +can only handle simple differences at the block level; it cannot, for +example, move blocks around or do fuzzy matches. +http://en.wikipedia.org/wiki/Cluster_analysis + +(4) We then output a tree (technically a 'Cladogram') showing a +hierarchy of guests, using a simple hierarchical clustering algorithm, +where we group the two closest guests, then that group with the next +closest guest, and so forth. + +http://en.wikipedia.org/wiki/Cladogram +http://en.wikipedia.org/wiki/Hierarchical_clustering diff --git a/cache.ml b/cache.ml new file mode 100644 index 0000000..7a802f5 --- /dev/null +++ b/cache.ml @@ -0,0 +1,61 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +open Unix +open Filename + +type filename = string +type hashes = string array +type cache = (string * hashes) list + +let (//) = Filename.concat + +let version = "1" + +let cache_file = + try Some (Sys.getenv "HOME" // ".similarity-cache.v" ^ version) + with Not_found -> None + +let read_cache_file () : cache = + match cache_file with + | None -> [] + | Some cache_file -> + try + let chan = open_in cache_file in + let v = input_value chan in + close_in chan; + v + with + _ -> [] + +let write_cache_file (cache : cache) = + match cache_file with + | None -> () + | Some cache_file -> + let tmp_name = cache_file ^ ".tmp" in + let chan = open_out tmp_name in + output_value chan cache; + close_out chan; + rename tmp_name cache_file + +let get_hash cache filename = + try Some (List.assoc filename cache) with Not_found -> None + +let update_hash cache filename hashes = + let cache = List.remove_assoc filename cache in + (filename, hashes) :: cache diff --git a/cache.mli b/cache.mli new file mode 100644 index 0000000..9bd9273 --- /dev/null +++ b/cache.mli @@ -0,0 +1,42 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +(** Handle the cache file. *) + +type cache +(** The immutable cache, which is a map of filename to {!hashes}. *) + +type hashes = string array +(** An array of block hashes. One array for each disk image. *) + +type filename = string +(** A filename. *) + +val read_cache_file : unit -> cache +(** Read the cache file. *) + +val write_cache_file : cache -> unit +(** Write the cache file. *) + +val get_hash : cache -> filename -> hashes option +(** Get the hash for a particular filename from the cache. *) + +val update_hash : cache -> filename -> hashes -> cache +(** Set or update the hashes corresponding to filename in the cache. + Note that since {!cache} is immutable, this returns a new cache + object. *) diff --git a/cladogram.ml b/cladogram.ml new file mode 100644 index 0000000..b02aed5 --- /dev/null +++ b/cladogram.ml @@ -0,0 +1,70 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +open Printf + +type t = + | Leaf of int (* A single disk image (index of). *) + | Node of t list (* An interior node in the tree. *) + +let rec images_in_subtree = function + | Leaf i -> [i] + | Node xs -> List.concat (List.map images_in_subtree xs) + +let max_list = List.fold_left max min_int + +let mapi f xs = + let rec loop i = function + | [] -> [] + | x :: xs -> let r = f i x in r :: loop (i+1) xs + in + loop 0 xs + +let format_cladogram ?format_leaf t = + let format_leaf = match format_leaf with + | None -> string_of_int + | Some f -> f + in + let rec format = function + | Leaf i -> + let s = "--- " ^ format_leaf i in + [s; ""], String.length s + | Node xs -> + let xs = List.map format xs in + let n = List.length xs in + let w = 7 + max_list (List.map snd xs) in + let xs = mapi ( + fun row (ss, _) -> + let s, ss = match ss with + | s :: ss -> s, ss + | [] -> assert false in + if row = 0 then ( + ("---+---" ^ s) :: + List.map (fun s -> " | " ^ s) ss + ) else if row < n-1 then ( + (" +---" ^ s) :: + List.map (fun s -> " | " ^ s) ss + ) else ( + (" +---" ^ s) :: + List.map (fun s -> " " ^ s) ss + ) + ) xs in + List.concat xs, w + in + let strs, _ = format t in + strs diff --git a/cladogram.mli b/cladogram.mli new file mode 100644 index 0000000..95d9c4e --- /dev/null +++ b/cladogram.mli @@ -0,0 +1,27 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +(** Cladograms. *) + +type t = + | Leaf of int (** A single disk image (index of). *) + | Node of t list (** An interior node in the tree. *) + +val images_in_subtree : t -> int list + +val format_cladogram : ?format_leaf:(int -> string) -> t -> string list diff --git a/config.ml b/config.ml new file mode 100644 index 0000000..37cdddb --- /dev/null +++ b/config.ml @@ -0,0 +1,22 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +let package_name = "virt-similarity" +let package_version = "1.0" + +let ocamlopt = "ocamlopt.opt" <> "no" diff --git a/config.ml.in b/config.ml.in new file mode 100644 index 0000000..1249c80 --- /dev/null +++ b/config.ml.in @@ -0,0 +1,22 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +let package_name = "@PACKAGE_NAME@" +let package_version = "@PACKAGE_VERSION@" + +let ocamlopt = "@OCAMLOPT@" <> "no" diff --git a/configure.ac b/configure.ac index 705e9b0..da19a0b 100644 --- a/configure.ac +++ b/configure.ac @@ -90,5 +90,6 @@ fi AC_CONFIG_HEADERS([config.h]) AC_CONFIG_FILES([Makefile + config.ml virt-similarity.spec]) AC_OUTPUT diff --git a/hash.ml b/hash.ml new file mode 100644 index 0000000..f4770ee --- /dev/null +++ b/hash.ml @@ -0,0 +1,24 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +module MD5 = struct + let algorithm_name = "MD-5" + let hash_bits_per_block = 128 + let hash_bytes_per_block = hash_bits_per_block / 8 + let digest = Digest.string +end diff --git a/similarity.ml b/similarity.ml index 561479d..0dd52a4 100644 --- a/similarity.ml +++ b/similarity.ml @@ -16,3 +16,221 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. *) +open Printf + +open Cladogram +open Utils + +let blocksize = 65536 +module HashAlgorithm = Hash.MD5 +let hash_bytes_per_block = HashAlgorithm.hash_bytes_per_block + +(* Int64 operators. *) +let (+^) = Int64.add +let (-^) = Int64.sub +let ( *^ ) = Int64.mul +let (/^) = Int64.div + +(* Read command line parameters. *) +let n, filenames = + let display_version () = + printf "%s %s (%s)\n" + Config.package_name Config.package_version + (if Config.ocamlopt then "native" else "bytecode"); + exit 0 + in + + let argspec = Arg.align [ + "--version", Arg.Unit display_version, " Display version number and exit"; + "-V", Arg.Unit display_version, " Display version number and exit"; + ] in + + let filenames = ref [] in + let collect_filenames str = filenames := str :: !filenames in + + let usage_msg = " +virt-similarity: Find clusters of similar/cloned virtual machines +Copyright (C) 2013 Red Hat Inc. + +For full documentation see the virt-similarity(1) man page. + +Usage: + + virt-similarity [options] disk.img disk.img [disk.img ...] + +You must supply at least one disk image. You can supply disk +images in most common formats (raw, qcow2, vmdk, etc.) + +Options: +" in + + (* Do the argument parsing. *) + Arg.parse argspec collect_filenames usage_msg; + + (* Check the arguments. *) + let filenames = Array.of_list (List.rev !filenames) in + let n = Array.length filenames in + if n < 2 then ( + eprintf "virt-similarity: At least two disk images must be specified.\n"; + exit 1 + ); + + n, filenames + +(* Read in the cache file. *) +let cache = Cache.read_cache_file () + +(* Read the disk images, hash them, and update the cache. *) +let read_disk_image filename = + let g = new Guestfs.guestfs () in + g#add_drive_ro filename; + g#launch (); + + let dev = "/dev/sda" in + let size = g#blockdev_getsize64 dev in + let rec loop offset last_percent accum = + if offset <= size -^ Int64.of_int blocksize then ( + let percent = 100L *^ offset /^ size in + if percent <> last_percent then printf "%Ld%%\r%!" percent; + + let block = g#pread_device dev blocksize offset in + let hash = HashAlgorithm.digest block in + loop (offset +^ Int64.of_int blocksize) percent (hash :: accum) + ) + else accum + in + let hashes_reversed = loop 0L (-1L) [] in + g#close (); + + Array.of_list (List.rev hashes_reversed) + +let cache = + List.fold_left ( + fun cache filename -> + let cache, hashes = + match Cache.get_hash cache filename with + | Some hashes -> + printf "%s: disk image is already in the cache\n" filename; + cache, hashes + | None -> + printf "%s: reading disk image ...\n%!" filename; + let hashes = read_disk_image filename in + Cache.update_hash cache filename hashes, hashes in + printf "%s: number of blocks = %d\n" filename (Array.length hashes); + cache + ) cache (Array.to_list filenames) + +(* Write the updated cache file. *) +let () = Cache.write_cache_file cache + +(* Work out the similarity for each pair of guests and store it in a + * matrix, where matrix.(i).(j) is the distance between filenames.(i) + * and filenames.(j). + *) +let hash_of_zero = + let zero_string = String.make blocksize (Char.chr 0) in + HashAlgorithm.digest zero_string + +let calculate_distance hash1 hash2 = + let hash1 = Array.to_list hash1 in + let hash2 = Array.to_list hash2 in + let rec loop = function + | [], [] -> 0 + | (x :: xs), [] when x = hash_of_zero -> loop (xs, []) + | (x :: xs), [] -> 1 + loop (xs, []) + | [], (y :: ys) when y = hash_of_zero -> loop ([], ys) + | [], (y :: ys) -> 1 + loop ([], ys) + | (x :: xs), (y :: ys) when x = y -> loop (xs, ys) + | (x :: xs), (y :: ys) -> 1 + loop (xs, ys) + in + loop (hash1, hash2) + +let matrix = + let matrix = Array.make_matrix n n 0 in + List.iter ( + fun (i, j) -> + let hi = Cache.get_hash cache filenames.(i) in + let hj = Cache.get_hash cache filenames.(j) in + match hi, hj with + | Some hi, Some hj -> + let d = calculate_distance hi hj in + printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d; + matrix.(i).(j) <- d; + matrix.(j).(i) <- d + | _ -> assert false + ) (pairs_of_ints n); + matrix + +(* Construct the tree (cladogram). *) +let cladogram = + (* At the bottom level, every disk image is in its own leaf. *) + let leaves = + let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in + loop 0 in + + (* Find the closest subtrees and combine them. *) + let rec combine_closest_subtrees trees = + let trees = Array.of_list trees in + let n = Array.length trees in + + (* Find the minimum distance between any two subtrees. *) + let min_distance = ref max_int in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees trees.(i) trees.(j) in + if d < !min_distance then min_distance := d + ) (pairs_of_ints n); + + let min_distance = !min_distance in + + (* For each subtree that differs from another by exactly the + * minimum distance, group them together into a single subtree. + *) + let in_group = Array.make n false in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees trees.(i) trees.(j) in + if d = min_distance then ( + in_group.(i) <- true; + in_group.(j) <- true + ) + ) (pairs_of_ints n); + + let group = ref [] and rest = ref [] in + Array.iteri ( + fun i in_group -> + if in_group then + group := trees.(i) :: !group + else + rest := trees.(i) :: !rest + ) in_group; + + !rest @ [Node !group] + + and min_distance_between_subtrees t1 t2 = + let min_distance = ref max_int in + + let xs = images_in_subtree t1 in + let ys = images_in_subtree t2 in + + List.iter ( + fun (i, j) -> + let d = matrix.(i).(j) in + if d < !min_distance then min_distance := d + ) (pairs xs ys); + + !min_distance + in + + let rec loop trees = + match trees with + | [] -> assert false + | [x] -> x (* finished *) + | xs -> loop (combine_closest_subtrees xs) + in + loop leaves + +let () = + let format_leaf i = Filename.basename filenames.(i) in + let lines = format_cladogram ~format_leaf cladogram in + List.iter print_endline lines diff --git a/utils.ml b/utils.ml new file mode 100644 index 0000000..6eb4965 --- /dev/null +++ b/utils.ml @@ -0,0 +1,36 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +(* Returns all unique pairs of integers (i, j) = (0..n-1, 0..n-1) where i != j. + * For example: + * 'pairs_of_ints 3' returns [ (0,1); (0,2); (1,2) ] + *) +let pairs_of_ints n = + let ret = ref [] in + for i = n-1 downto 0 do + for j = i-1 downto 0 do + ret := (j, i) :: !ret + done + done; + !ret + +(* Returns all pairs from lists 'xs' and 'ys'. *) +let pairs xs ys = + let ret = ref [] in + List.iter (fun x -> List.iter (fun y -> ret := (x, y) :: !ret) ys) xs; + !ret -- 1.8.3.1