* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
- * $Id: bitmatch.mli,v 1.9 2008-04-02 11:06:31 rjones Exp $
+ * $Id: bitmatch.mli,v 1.20 2008-05-08 21:28:28 rjones Exp $
*)
(**
+ {{:#reference}Jump straight to the reference section for
+ documentation on types and functions}.
+
{2 Introduction}
Bitmatch adds Erlang-style bitstrings and matching over bitstrings
as a syntax extension and library for OCaml. You can use
- this module to both parse and generate binary formats.
+ this module to both parse and generate binary formats, for
+ example, communications protocols, disk formats and binary files.
+
+ {{:http://code.google.com/p/bitmatch/}OCaml bitmatch website}
{2 Examples}
| Options | Padding |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*)
- | 4 : 4; hdrlen : 4; tos : 8; length : 16;
- identification : 16; flags : 3; fragoffset : 13;
- ttl : 8; protocol : 8; checksum : 16;
- source : 32;
- dest : 32;
- options : (hdrlen-5)*32 : bitstring;
- payload : -1 : bitstring ->
+ | { 4 : 4; hdrlen : 4; tos : 8; length : 16;
+ identification : 16; flags : 3; fragoffset : 13;
+ ttl : 8; protocol : 8; checksum : 16;
+ source : 32;
+ dest : 32;
+ options : (hdrlen-5)*32 : bitstring;
+ payload : -1 : bitstring } ->
printf "IPv4:\n";
printf " header length: %d * 32 bit words\n" hdrlen;
printf " packet payload:\n";
Bitmatch.hexdump_bitstring stdout payload
- | version : 4 ->
+ | { version : 4 } ->
eprintf "unknown IP version %d\n" version;
exit 1
- | _ as pkt ->
+ | { _ } as pkt ->
eprintf "data is smaller than one nibble:\n";
Bitmatch.hexdump_bitstring stderr pkt;
exit 1
let () =
bitmatch bits with
- | s_inodes_count : 32 : littleendian; (* Inodes count *)
- s_blocks_count : 32 : littleendian; (* Blocks count *)
- s_r_blocks_count : 32 : littleendian; (* Reserved blocks count *)
- s_free_blocks_count : 32 : littleendian; (* Free blocks count *)
- s_free_inodes_count : 32 : littleendian; (* Free inodes count *)
- s_first_data_block : 32 : littleendian; (* First Data Block *)
- s_log_block_size : 32 : littleendian; (* Block size *)
- s_log_frag_size : 32 : littleendian; (* Fragment size *)
- s_blocks_per_group : 32 : littleendian; (* # Blocks per group *)
- s_frags_per_group : 32 : littleendian; (* # Fragments per group *)
- s_inodes_per_group : 32 : littleendian; (* # Inodes per group *)
- s_mtime : 32 : littleendian; (* Mount time *)
- s_wtime : 32 : littleendian; (* Write time *)
- s_mnt_count : 16 : littleendian; (* Mount count *)
- s_max_mnt_count : 16 : littleendian; (* Maximal mount count *)
- 0xef53 : 16 : littleendian -> (* Magic signature *)
+ | { s_inodes_count : 32 : littleendian; (* Inodes count *)
+ s_blocks_count : 32 : littleendian; (* Blocks count *)
+ s_r_blocks_count : 32 : littleendian; (* Reserved blocks count *)
+ s_free_blocks_count : 32 : littleendian; (* Free blocks count *)
+ s_free_inodes_count : 32 : littleendian; (* Free inodes count *)
+ s_first_data_block : 32 : littleendian; (* First Data Block *)
+ s_log_block_size : 32 : littleendian; (* Block size *)
+ s_log_frag_size : 32 : littleendian; (* Fragment size *)
+ s_blocks_per_group : 32 : littleendian; (* # Blocks per group *)
+ s_frags_per_group : 32 : littleendian; (* # Fragments per group *)
+ s_inodes_per_group : 32 : littleendian; (* # Inodes per group *)
+ s_mtime : 32 : littleendian; (* Mount time *)
+ s_wtime : 32 : littleendian; (* Write time *)
+ s_mnt_count : 16 : littleendian; (* Mount count *)
+ s_max_mnt_count : 16 : littleendian; (* Maximal mount count *)
+ 0xef53 : 16 : littleendian } -> (* Magic signature *)
printf "ext3 superblock:\n";
printf " s_inodes_count = %ld\n" s_inodes_count;
printf " s_free_inodes_count = %ld\n" s_free_inodes_count;
printf " s_free_blocks_count = %ld\n" s_free_blocks_count
- | _ ->
+ | { _ } ->
eprintf "not an ext3 superblock!\n%!";
exit 2
]}
*)
let make_message typ subtype param =
- (BITSTRING
+ (BITSTRING {
typ : 16;
subtype : 16;
- param : 32) ;;
+ param : 32
+ }) ;;
]}
{2 Loading, creating bitstrings}
{!bitstring_of_chan}.
There are also functions to create bitstrings from arbitrary data.
- See the reference below.
+ See the {{:#reference}reference} below.
{2 Matching bitstrings with patterns}
The general form of [bitmatch] is:
- [bitmatch] {i bitstring-expression} [with]
+ [bitmatch {] {i bitstring-expression} [} with]
- [|] {i pattern} [->] {i code}
+ [| {] {i pattern} [} ->] {i code}
- [|] {i pattern} [->] {i code}
+ [| {] {i pattern} [} ->] {i code}
[|] ...
match then the standard library [Match_failure] exception is
thrown.
- Patterns look a bit different from normal match patterns. The
+ Patterns look a bit different from normal match patterns. They
consist of a list of bitfields separated by [;] where each bitfield
contains a bind variable, the width (in bits) of the field, and
other information. Some example patterns:
{[
bitmatch bits with
-| version : 8; name : 8; param : 8 -> ...
+| { version : 8; name : 8; param : 8 } -> ...
(* Bitstring of at least 3 bytes. First byte is the version
number, second byte is a field called name, third byte is
a field called parameter. *)
-| flag : 1 ->
+| { flag : 1 } ->
printf "flag is %b\n" flag
(* A single flag bit (mapped into an OCaml boolean). *)
-| len : 4; data : 1+len ->
+| { len : 4; data : 1+len } ->
printf "len = %d, data = 0x%Lx\n" len data
(* A 4-bit length, followed by 1-16 bits of data, where the
length of the data is computed from len. *)
-| ipv6_source : 128 : bitstring;
- ipv6_dest : 128 : bitstring -> ...
+| { ipv6_source : 128 : bitstring;
+ ipv6_dest : 128 : bitstring } -> ...
(* IPv6 source and destination addresses. Each is 128 bits
and is mapped into a bitstring type which will be a substring
You can also add conditional when-clauses:
{[
-| version : 4
+| { version : 4 }
when version = 4 || version = 6 -> ...
(* Only match and run the code when version is 4 or 6. If
length is zero in the when-clause:
{[
-| n : 4;
- rest : -1 : bitstring
+| { n : 4;
+ rest : -1 : bitstring }
when Bitmatch.bitstring_length rest = 0 -> ...
(* Only matches exactly 4 bits. *)
but you can also match a constant, as in:
{[
-| 6 : 4 -> ...
+| { (4|6) : 4 } -> ...
+
+ (* Only matches if the first 4 bits contain either
+ the integer 4 or the integer 6. *)
+]}
+
+ One may also match on strings:
+
+{[
+| { "MAGIC" : 5*8 : string } -> ...
- (* Only matches if the first 4 bits contain the integer 6. *)
+ (* Only matches if the string "MAGIC" appears at the start
+ of the input. *)
]}
- {3 Pattern field reference}
+ {3:patternfieldreference Pattern field reference}
The exact format of each pattern field is:
expression and can use any values defined in the program, and refer
back to earlier fields (but not to later fields).
- Integers can only have lengths in the range [1..64] bits. See the
+ Integers can only have lengths in the range \[1..64\] bits. See the
{{:#integertypes}integer types} section below for how these are
mapped to the OCaml int/int32/int64 types. This is checked
at compile time if the length expression is constant, otherwise it is
signedness and endianness of the field. Permissible qualifiers are:
- [int] (field has an integer type)
+ - [string] (field is a string type)
- [bitstring] (field is a bitstring type)
- [signed] (field is signed)
- [unsigned] (field is unsigned)
bitstring and/or have a default match case:
{[
-| _ -> ...
+| { _ } -> ...
(* Default match case. *)
-| _ as pkt -> ...
+| { _ } as pkt -> ...
(* Default match case, with 'pkt' bound to the whole bitstring. *)
]}
let version = 1 ;;
let data = 10 ;;
let bits =
- BITSTRING
+ BITSTRING {
version : 4;
- data : 12 ;;
+ data : 12
+ } ;;
(* Constructs a 16-bit bitstring with the first four bits containing
the integer 1, and the following 12 bits containing the integer 10,
*)
]}
-
+ The format of each field is the same as for pattern fields (see
+ {{:#patternfieldreference}Pattern field reference section}), and
+ things like computed length fields, fixed value fields, insertion
+ of bitstrings within bitstrings, etc. are all supported.
+
+ {3 Construction exception}
+ The [BITSTRING] operator may throw a {!Construct_failure}
+ exception at runtime.
+
+ Runtime errors include:
+
+ - int field length not in the range \[1..64\]
+ - a bitstring with a length declared which doesn't have the
+ same length at runtime
+ - trying to insert an out of range value into an int field
+ (eg. an unsigned int field which is 2 bits wide can only
+ take values in the range \[0..3\]).
{2:integertypes Integer types}
+ Integer types are mapped to OCaml types [bool], [int], [int32] or
+ [int64] using a system which tries to ensure that (a) the types are
+ reasonably predictable and (b) the most efficient type is
+ preferred.
+
+ The rules are slightly different depending on whether the bit
+ length expression in the field is a compile-time constant or a
+ computed expression.
+
+ Detection of compile-time constants is quite simplistic so only an
+ simple integer literals and simple expressions (eg. [5*8]) are
+ recognized as constants.
+ In any case the bit size of an integer is limited to the range
+ \[1..64\]. This is detected as a compile-time error if that is
+ possible, otherwise a runtime check is added which can throw an
+ [Invalid_argument] exception.
+ The mapping is thus:
+ {v
+ Bit size ---- OCaml type ----
+ Constant Computed expression
+ 1 bool int64
+ 2..31 int int64
+ 32 int32 int64
+ 33..64 int64 int64
+ v}
+
+ A possible future extension may allow people with 64 bit computers
+ to specify a more optimal [int] type for bit sizes in the range
+ [32..63]. If this was implemented then such code {i could not even
+ be compiled} on 32 bit platforms, so it would limit portability.
+
+ Another future extension may be to allow computed
+ expressions to assert min/max range for the bit size,
+ allowing a more efficient data type than int64 to be
+ used. (Of course under such circumstances there would
+ still need to be a runtime check to enforce the
+ size).
{2 Compiling}
+ Using the compiler directly you can do:
+
+ {v
+ ocamlc -I +bitmatch \
+ -pp "camlp4o `ocamlc -where`/bitmatch/pa_bitmatch.cmo" \
+ bitmatch.cma test.ml -o test
+ v}
+ Simpler method using findlib:
+ {v
+ ocamlfind ocamlc \
+ -package bitmatch.syntax -syntax bitmatch.syntax \
+ -linkpkg test.ml -o test
+ v}
+ {2 Security and type safety}
- {2 Safety and security considerations}
+ {3 Security on input}
+ The main concerns for input are buffer overflows and denial
+ of service.
+ It is believed that this library is robust against attempted buffer
+ overflows. In addition to OCaml's normal bounds checks, we check
+ that field lengths are >= 0, and many additional checks.
+ Denial of service attacks are more problematic. We only work
+ forwards through the bitstring, thus computation will eventually
+ terminate. As for computed lengths, code such as this is thought
+ to be secure:
+
+ {[
+ bitmatch bits with
+ | { len : 64;
+ buffer : Int64.to_int len : bitstring } ->
+ ]}
+
+ The [len] field can be set arbitrarily large by an attacker, but
+ when pattern-matching against the [buffer] field this merely causes
+ a test such as [if len <= remaining_size] to fail. Even if the
+ length is chosen so that [buffer] bitstring is allocated, the
+ allocation of sub-bitstrings is efficient and doesn't involve an
+ arbitary-sized allocation or any copying.
+
+ However the above does not necessarily apply to strings used in
+ matching, since they may cause the library to use the
+ {!Bitmatch.string_of_bitstring} function, which allocates a string.
+ So you should take care if you use the [string] type particularly
+ with a computed length that is derived from external input.
+
+ The main protection against attackers should be to ensure that the
+ main program will only read input bitstrings up to a certain
+ length, which is outside the scope of this library.
+
+ {3 Security on output}
+
+ As with the input side, computed lengths are believed to be
+ safe. For example:
+
+ {[
+ let len = read_untrusted_source () in
+ let buffer = allocate_bitstring () in
+ BITSTRING {
+ buffer : len : bitstring
+ }
+ ]}
+
+ This code merely causes a check that buffer's length is the same as
+ [len]. However the program function [allocate_bitstring] must
+ refuse to allocate an oversized buffer (but that is outside the
+ scope of this library).
+
+ {3 Order of evaluation}
+
+ In [bitmatch] statements, fields are evaluated left to right.
+
+ Note that the when-clause is evaluated {i last}, so if you are
+ relying on the when-clause to filter cases then your code may do a
+ lot of extra and unncessary pattern-matching work on fields which
+ may never be needed just to evaluate the when-clause. You can
+ usually rearrange the code to do only the first part of the match,
+ followed by the when-clause, followed by a second inner bitmatch.
+
+ {3 Safety}
+
+ The current implementation is believed to be fully type-safe,
+ and makes compile and run-time checks where appropriate. If
+ you find a case where a check is missing please submit a
+ bug report or a patch.
{2 Limits}
+ These are thought to be the current limits:
+
+ Integers: \[1..64\] bits.
+ Bitstrings (32 bit platforms): maximum length is limited
+ by the string size, ie. 16 MBytes.
+ Bitstrings (64 bit platforms): maximum length is thought to be
+ limited by the string size, ie. effectively unlimited.
+ Bitstrings must be loaded into memory before we can match against
+ them. Thus available memory may be considered a limit for some
+ applications.
- {2 Reference}
+ {2:reference Reference}
{3 Types}
*)
The type contains the underlying data (a string),
the current bit offset within the string and the
current bit length of the string (counting from the
- bit offset). Note that the offsets are bits, not bytes.
+ bit offset). Note that the offset and length are
+ in {b bits}, not bytes.
Normally you don't need to use the bitstring type
directly, since there are functions and syntax
extensions which hide the details.
- See {!bitstring_of_file}, {!hexdump_bitstring},
- {!bitstring_length}.
+
+ See also {!bitstring_of_string}, {!bitstring_of_file},
+ {!hexdump_bitstring}, {!bitstring_length}.
*)
(** {3 Exceptions} *)
Note that the length is in bits, not bytes. *)
+val bitstring_of_string : string -> bitstring
+(** [bitstring_of_string str] creates a bitstring
+ of length [String.length str * 8] (bits) containing the
+ bits in [str].
+
+ Note that the bitstring uses [str] as the underlying
+ string (see the representation of {!bitstring}) so you
+ should not change [str] after calling this. *)
+
+val bitstring_of_file : string -> bitstring
+(** [bitstring_of_file filename] loads the named file
+ into a bitstring. *)
+
val bitstring_of_chan : in_channel -> bitstring
(** [bitstring_of_chan chan] loads the contents of
the input channel [chan] as a bitstring.
The length of the final bitstring is determined
by the remaining input in [chan], but will always
- be a multiple of 8 bits. *)
+ be a multiple of 8 bits.
-val bitstring_of_file : string -> bitstring
-(** [bitstring_of_file filename] loads the named file
- into a bitstring. *)
+ See also {!bitstring_of_chan_max}. *)
-val hexdump_bitstring : out_channel -> bitstring -> unit
-(** [hexdump_bitstring chan bitstring] prints the bitstring
- to the output channel in a format similar to the
- Unix command [hexdump -C]. *)
+val bitstring_of_chan_max : in_channel -> int -> bitstring
+(** [bitstring_of_chan_max chan max] works like
+ {!bitstring_of_chan} but will only read up to
+ [max] bytes from the channel (or fewer if the end of input
+ occurs before that). *)
+
+val bitstring_of_file_descr : Unix.file_descr -> bitstring
+(** [bitstring_of_file_descr fd] loads the contents of
+ the file descriptor [fd] as a bitstring.
+
+ See also {!bitstring_of_chan}, {!bitstring_of_file_descr_max}. *)
+
+val bitstring_of_file_descr_max : Unix.file_descr -> int -> bitstring
+(** [bitstring_of_file_descr_max fd max] works like
+ {!bitstring_of_file_descr} but will only read up to
+ [max] bytes from the channel (or fewer if the end of input
+ occurs before that). *)
val bitstring_length : bitstring -> int
(** [bitstring_length bitstring] returns the length of
the bitstring in bits. *)
+val string_of_bitstring : bitstring -> string
+(** [string_of_bitstring bitstring] converts a bitstring to a string
+ (eg. to allow comparison).
+
+ This function is inefficient. In the best case when the bitstring
+ is nicely byte-aligned we do a [String.sub] operation. If the
+ bitstring isn't aligned then this involves a lot of bit twiddling
+ and is particularly inefficient. *)
+
+(** {3 Printing bitstrings} *)
+
+val hexdump_bitstring : out_channel -> bitstring -> unit
+(** [hexdump_bitstring chan bitstring] prints the bitstring
+ to the output channel in a format similar to the
+ Unix command [hexdump -C]. *)
+
(** {3 Bitstring buffer} *)
module Buffer : sig
val extract_int64_be_unsigned : string -> int -> int -> int -> int64 * int * int
-val construct_bit : Buffer.t -> bool -> int -> unit
+val extract_int64_le_unsigned : string -> int -> int -> int -> int64 * int * int
+
+val construct_bit : Buffer.t -> bool -> int -> exn -> unit
val construct_char_unsigned : Buffer.t -> int -> int -> exn -> unit
val construct_int_be_unsigned : Buffer.t -> int -> int -> exn -> unit
+val construct_int32_be_unsigned : Buffer.t -> int32 -> int -> exn -> unit
+
val construct_int64_be_unsigned : Buffer.t -> int64 -> int -> exn -> unit
+
+val construct_string : Buffer.t -> string -> unit