Merge branch 'chromeos-verity-3.4' into chromeos-3.4
author Olof Johansson <olof@lixom.net>
Fri, 1 Jun 2012 07:06:58 +0000 (00:06 -0700)
committer Olof Johansson <olof@lixom.net>
Fri, 1 Jun 2012 07:06:58 +0000 (00:06 -0700)
By Mandeep Singh Baines (21) and others
* chromeos-verity-3.4: (48 commits)
  CHROMIUM: verity: Add stats on block sizes
  CHROMIUM: verity: Add a debugfs interface with basic stats access
  CHROMIUM: verity: Add a few comments about the statistics variables
  CHROMIUM: verity: Add a name for each verity instance
  CHROMIUM: verity: make total_requeues a uint64
  CHROMIUM: verity: honor optional salt argument.
  CHROMIUM: dm-bht: add salt support.
  CHROMIUM: init: don't dm_substitute_devices().
  CHROMIUM: verity: embed hash_desc instead of allocating it
  CHROMIUM: verity: use one shared queue per processor
  CHROMIUM: dm-verity: optionally support key-val args
  CHROMIUM: verity: use alloc_page instead of mempool_alloc
  CHROMIUM: verity: convert depth to an int
  CHROMIUM: verity: statically allocate root_digest
  CHROMIUM: verity: use block instead of block_index everywhere
  CHROMIUM: verity: short-circuit dm_bht_populate
  CHROMIUM: verity: cleanup commenting of io_bht_populate
  CHROMIUM: verity: change type (u64) for io block and count
  CHROMIUM: verity: root hash should not rely on uninitialized memory
  CHROMIUM: verity: use high priority workqueues
  ...

19 files changed:
Documentation/device-mapper/boot.txt [new file with mode: 0644]
Documentation/device-mapper/dm-bht.txt [new file with mode: 0644]
Documentation/device-mapper/dm-verity.txt [new file with mode: 0644]
Documentation/device-mapper/verity.txt [deleted file]
Documentation/kernel-parameters.txt
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/dm-bht.c [new file with mode: 0644]
drivers/md/dm-ioctl.c
drivers/md/dm-table.c
drivers/md/dm-verity-chromeos.c [new file with mode: 0644]
drivers/md/dm-verity.c
drivers/md/dm-verity.h [new file with mode: 0644]
include/linux/device-mapper.h
include/linux/dm-bht.h [new file with mode: 0644]
init/Makefile
init/do_mounts.c
init/do_mounts.h
init/do_mounts_dm.c [new file with mode: 0644]

diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt
new file mode 100644 (file)
index 0000000..adcaad5
--- /dev/null
@@ -0,0 +1,48 @@
+Boot time creation of mapped devices
+====================================
+
+It is possible to configure a device mapper device to act as the root
+device for your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal
+userspace which configures the device, then pivot_root(8) into it.
+
+For simple device mapper configurations, it is possible to boot directly
+using the following kernel command line:
+
+dm="<name> <uuid> <ro>,table line 1,...,table line n"
+
+name = the name to associate with the device.
+       After boot, udev (if used) will use this name to label the
+       device node.
+uuid = may be 'none' or the UUID desired for the device.
+ro = may be "ro" or "rw".  If "ro", the device and device table will be
+       marked read-only.
+
+Each table line may be as normal when using the dmsetup tool except for
+two variations:
+1. Any use of commas will be interpreted as a newline
+2. Quotation marks cannot be escaped and cannot be used without
+   terminating the dm= argument.
+
+Unless renamed by udev, the device node created will be dm-0, since the
+first device-mapper minor number is used during early creation.
+
+Example
+=======
+
+- Booting to a linear array made up of user-mode linux block devices:
+
+  dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \
+  root=/dev/dm-0
+
+This will boot to a read-write dm-linear target of 8192 sectors split
+across two block devices identified by their major:minor numbers.  After
+boot, udev will rename this target to /dev/mapper/lroot (depending on
+the rules).  No uuid was assigned.
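+
+As a further sketch (hypothetical values; the table must match a hash
+tree built as described in dm-verity.txt), a read-only verity root:
+
+  dm="vroot none ro,0 204800 verity /dev/sda1 /dev/sda2 0 3 sha1 9f74809a2ee7607b16fcc70d9399a4de9725a727" \
+  root=/dev/dm-0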
diff --git a/Documentation/device-mapper/dm-bht.txt b/Documentation/device-mapper/dm-bht.txt
new file mode 100644 (file)
index 0000000..69bab02
--- /dev/null
@@ -0,0 +1,105 @@
+dm-bht
+======
+
+dm-bht provides a block hash tree implementation.  The use of dm-bht allows
+for integrity checking of a given block device without reading the entire
+set of blocks into memory before use.
+
+In particular, dm-bht supplies an interface for creating and verifying a tree
+of cryptographic digests with any algorithm supported by the kernel crypto API.
+
+The code is meant to be usable from user-space for creation and verification as
+well as directly from a Device-Mapper target.  The `verity' target is the
+motivating example.
+
+
+Theory of operation
+===================
+
+dm-bht is logically composed of multiple nodes organized in a tree-like
+structure.  Each node in the tree is a cryptographic hash.  If it is a leaf
+node, the hash is of some block data on disk.  If it is an intermediary node,
+then the hash is of a number of child nodes.
+
+dm-bht has a given depth starting at 1 (ignoring the root node).  Each level in
+the tree is concretely made up of dm_bht_entry structs.  Each entry in the tree
+is a collection of neighboring nodes that fit in one page-sized block.  The
+number of nodes per entry is determined by PAGE_SIZE and the digest size of
+the selected algorithm.  The hashes are linearly ordered in this entry
+and any unaligned trailing space is ignored but included when calculating the
+parent node.
+
+The tree looks something like:
+
+depth = 2, alg = sha256, num_blocks = 32768
+                                 [   root    ]
+                                /    . . .    \
+                     [entry_0]                 [entry_1]
+                    /  . . .  \                 . . .   \
+         [entry_0_0]   . . .  [entry_0_127]    . . . .  [entry_1_127]
+           / ... \             /   . . .  \             /           \
+     blk_0 ... blk_127  blk_16256   blk_16383      blk_32640 . . . blk_32767
+
+The root is treated independently from the depth, and the data blocks are
+expected to be hashed and supplied to the dm-bht.  Hash blocks that make up
+the entry contents are expected to be read from disk.
+
+dm-bht does not handle I/O directly but instead expects the consumer to
+supply callbacks.  The read callback will always receive a page-aligned
+value to pass to the block device layer to read in a hash value.
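+
+As a sketch (assuming the consumer has some way to issue the read; the
+helper my_issue_read() is hypothetical), a read callback might look like:
+
+  static int my_read_cb(void *ctx, sector_t start, u8 *dst,
+                        sector_t count, struct dm_bht_entry *entry)
+  {
+          /* dst is page-aligned; count covers one PAGE_SIZE entry. */
+          int err = my_issue_read(ctx, start, dst, count);
+
+          /* Completion must always be reported back to dm-bht. */
+          dm_bht_read_completed(entry, err);
+          return err;
+  }
+
+The callback is installed with dm_bht_set_read_cb().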
+
+Usage
+=====
+
+The API provides mechanisms for reading and verifying a tree as well as
+creating and modifying the tree.  These two code paths are not meant to be
+used in parallel, as they modify the atomic entry values in incompatible
+ways.  Where possible, tree creation and modification should be handled
+independently from tree verification.
+
+When reading, all required data for the hash tree should be populated for a
+block before attempting a verify.  This can be done by calling
+dm_bht_populate().  When all data is ready, a call to dm_bht_verify_block()
+with the expected hash value will perform both the direct block hash check and
+the hashes of the parent and neighboring nodes where needed to ensure validity
+up to the root hash.  Note, dm_bht_set_root_hexdigest() should be called before
+any verification attempts occur.
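+
+For example, a minimal verification sequence might look like the
+following (error handling elided; the variable names are placeholders):
+
+  dm_bht_set_root_hexdigest(bht, trusted_hexdigest);
+  dm_bht_populate(bht, io_ctx, block);
+  /* ... wait for the issued read callbacks to complete ... */
+  if (dm_bht_verify_block(bht, block, pg, 0))
+          /* fail the I/O */;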
+
+When updating the tree, all block hashes should be stored with
+dm_bht_store_block().  Once all hashes are stored, a call to dm_bht_compute()
+will initiate a full tree update by walking all of the blocks of hashes
+starting at the leaf nodes and computing upward to the root node.  On
+completion, dm_bht_sync() may be called to write the tree to disk (or wherever
+the callback writes to).
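+
+A minimal creation sequence, under the same assumptions, might be:
+
+  for (block = 0; block < block_count; block++)
+          dm_bht_store_block(bht, block, block_data[block]);
+  dm_bht_compute(bht, read_ctx);   /* hash upward to the root */
+  dm_bht_sync(bht, write_ctx);     /* write out all hash pages */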
diff --git a/Documentation/device-mapper/dm-verity.txt b/Documentation/device-mapper/dm-verity.txt
new file mode 100644 (file)
index 0000000..ee471d9
--- /dev/null
@@ -0,0 +1,73 @@
+dm-verity
+=========
+
+Device-Mapper's "verity" target provides transparent integrity checking of
+block devices using a cryptographic digest provided by the kernel crypto API.
+This target is read-only.
+
+Parameters: <device path> <hash device path> <tree depth> <alg> <root hash>
+
+<device path>
+    This is the device that is going to be integrity checked.  It may be
+    a subset of the full device as specified to dmsetup (start sector and count)
+    It may be specified as a path, like /dev/sdaX, or a device number,
+    <major>:<minor>.
+
+<hash device path>
+    This is the device that supplies the dm-bht hash data.  It may be
+    specified similarly to the device path and may be the same device.  If the
+    same device is used, the hash offset should be outside of the dm-verity
+    configured device size.
+
+<tree depth>
+    The tree depth determines how many levels of hashes are used when building
+    the tree of hashes.  The root of the tree is not included, and the leaves
+    of the tree are the hashes of the blocks on disk.
+
+<alg>
+    The cryptographic hash algorithm used for this device.  This should
+    be the name of the algorithm, like "sha1".
+
+<root hash>
+    The hexadecimal encoding of the cryptographic hash of all of the
+    neighboring nodes at the first level of the tree.  This hash should be
+    trusted, as there is no other authenticity check beyond this point.
+
+
+Theory of operation
+===================
+
+dm-verity is meant to be set up as part of a verified boot path.  This
+may be anything ranging from a boot using tboot or trustedgrub to just
+booting from a known-good device (like a USB drive or CD).
+
+When a dm-verity device is configured, it is expected that the caller
+has been authenticated in some way (cryptographic signatures, etc).
+After instantiation, all hashes will be verified on-demand during
+disk access.  If they cannot be verified up to the root node of the
+tree, the root hash, then the I/O will fail.  This should identify
+tampering with any data on the device or with the hash data.
+
+Cryptographic hashes are used to assert the integrity of the device on a
+per-block basis.  This allows for a lightweight hash computation on first read
+into the page cache.  Block hashes are stored linearly, padded out to the
+nearest page-sized block.  For example, with 4 KB pages and sha1 (20-byte
+digests), each hash block holds 128 digests (the count is rounded down to
+a power of two) and the trailing space is zero-padded.
+
+For more information on the hashing process, see dm-bht.txt.
+
+
+Example
+=======
+
+Set up a device:
+[[
+  dmsetup create vroot --table \
+    "0 204800 verity /dev/sda1 /dev/sda2 0 3 sha1 "\
+    "9f74809a2ee7607b16fcc70d9399a4de9725a727"
+]]
+
+A command line tool is available to compute the hash tree and return the
+root hash value.
+  http://git.chromium.org/cgi-bin/gitweb.cgi?p=dm-verity.git;a=tree
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
deleted file mode 100644 (file)
index 32e4879..0000000
+++ /dev/null
@@ -1,194 +0,0 @@
-dm-verity
-==========
-
-Device-Mapper's "verity" target provides transparent integrity checking of
-block devices using a cryptographic digest provided by the kernel crypto API.
-This target is read-only.
-
-Construction Parameters
-=======================
-    <version> <dev> <hash_dev> <hash_start>
-    <data_block_size> <hash_block_size>
-    <num_data_blocks> <hash_start_block>
-    <algorithm> <digest> <salt>
-
-<version>
-    This is the version number of the on-disk format.
-
-    0 is the original format used in the Chromium OS.
-       The salt is appended when hashing, digests are stored continuously and
-       the rest of the block is padded with zeros.
-
-    1 is the current format that should be used for new devices.
-       The salt is prepended when hashing and each digest is
-       padded with zeros to the power of two.
-
-<dev>
-    This is the device containing the data the integrity of which needs to be
-    checked.  It may be specified as a path, like /dev/sdaX, or a device number,
-    <major>:<minor>.
-
-<hash_dev>
-    This is the device that that supplies the hash tree data.  It may be
-    specified similarly to the device path and may be the same device.  If the
-    same device is used, the hash_start should be outside of the dm-verity
-    configured device size.
-
-<data_block_size>
-    The block size on a data device.  Each block corresponds to one digest on
-    the hash device.
-
-<hash_block_size>
-    The size of a hash block.
-
-<num_data_blocks>
-    The number of data blocks on the data device.  Additional blocks are
-    inaccessible.  You can place hashes to the same partition as data, in this
-    case hashes are placed after <num_data_blocks>.
-
-<hash_start_block>
-    This is the offset, in <hash_block_size>-blocks, from the start of hash_dev
-    to the root block of the hash tree.
-
-<algorithm>
-    The cryptographic hash algorithm used for this device.  This should
-    be the name of the algorithm, like "sha1".
-
-<digest>
-    The hexadecimal encoding of the cryptographic hash of the root hash block
-    and the salt.  This hash should be trusted as there is no other authenticity
-    beyond this point.
-
-<salt>
-    The hexadecimal encoding of the salt value.
-
-Theory of operation
-===================
-
-dm-verity is meant to be setup as part of a verified boot path.  This
-may be anything ranging from a boot using tboot or trustedgrub to just
-booting from a known-good device (like a USB drive or CD).
-
-When a dm-verity device is configured, it is expected that the caller
-has been authenticated in some way (cryptographic signatures, etc).
-After instantiation, all hashes will be verified on-demand during
-disk access.  If they cannot be verified up to the root node of the
-tree, the root hash, then the I/O will fail.  This should identify
-tampering with any data on the device and the hash data.
-
-Cryptographic hashes are used to assert the integrity of the device on a
-per-block basis.  This allows for a lightweight hash computation on first read
-into the page cache.  Block hashes are stored linearly-aligned to the nearest
-block the size of a page.
-
-Hash Tree
----------
-
-Each node in the tree is a cryptographic hash.  If it is a leaf node, the hash
-is of some block data on disk.  If it is an intermediary node, then the hash is
-of a number of child nodes.
-
-Each entry in the tree is a collection of neighboring nodes that fit in one
-block.  The number is determined based on block_size and the size of the
-selected cryptographic digest algorithm.  The hashes are linearly-ordered in
-this entry and any unaligned trailing space is ignored but included when
-calculating the parent node.
-
-The tree looks something like:
-
-alg = sha256, num_blocks = 32768, block_size = 4096
-
-                                 [   root    ]
-                                /    . . .    \
-                     [entry_0]                 [entry_1]
-                    /  . . .  \                 . . .   \
-         [entry_0_0]   . . .  [entry_0_127]    . . . .  [entry_1_127]
-           / ... \             /   . . .  \             /           \
-     blk_0 ... blk_127  blk_16256   blk_16383      blk_32640 . . . blk_32767
-
-
-On-disk format
-==============
-
-Below is the recommended on-disk format. The verity kernel code does not
-read the on-disk header. It only reads the hash blocks which directly
-follow the header. It is expected that a user-space tool will verify the
-integrity of the verity_header and then call dmsetup with the correct
-parameters. Alternatively, the header can be omitted and the dmsetup
-parameters can be passed via the kernel command-line in a rooted chain
-of trust where the command-line is verified.
-
-The on-disk format is especially useful in cases where the hash blocks
-are on a separate partition. The magic number allows easy identification
-of the partition contents. Alternatively, the hash blocks can be stored
-in the same partition as the data to be verified. In such a configuration
-the filesystem on the partition would be sized a little smaller than
-the full-partition, leaving room for the hash blocks.
-
-struct superblock {
-       uint8_t signature[8]
-               "verity\0\0";
-
-       uint8_t version;
-               1 - current format
-
-       uint8_t data_block_bits;
-               log2(data block size)
-
-       uint8_t hash_block_bits;
-               log2(hash block size)
-
-       uint8_t pad1[1];
-               zero padding
-
-       uint16_t salt_size;
-               big-endian salt size
-
-       uint8_t pad2[2];
-               zero padding
-
-       uint32_t data_blocks_hi;
-               big-endian high 32 bits of the 64-bit number of data blocks
-
-       uint32_t data_blocks_lo;
-               big-endian low 32 bits of the 64-bit number of data blocks
-
-       uint8_t algorithm[16];
-               cryptographic algorithm
-
-       uint8_t salt[384];
-               salt (the salt size is specified above)
-
-       uint8_t pad3[88];
-               zero padding to 512-byte boundary
-}
-
-Directly following the header (and with sector number padded to the next hash
-block boundary) are the hash blocks which are stored a depth at a time
-(starting from the root), sorted in order of increasing index.
-
-Status
-======
-V (for Valid) is returned if every check performed so far was valid.
-If any check failed, C (for Corruption) is returned.
-
-Example
-=======
-
-Setup a device:
-  dmsetup create vroot --table \
-    "0 2097152 "\
-    "verity 1 /dev/sda1 /dev/sda2 4096 4096 2097152 1 "\
-    "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\
-    "1234000000000000000000000000000000000000000000000000000000000000"
-
-A command line tool veritysetup is available to compute or verify
-the hash tree or activate the kernel driver.  This is available from
-the LVM2 upstream repository and may be supplied as a package called
-device-mapper-verity-tools:
-    git://sources.redhat.com/git/lvm2
-    http://sourceware.org/git/?p=lvm2.git
-    http://sourceware.org/cgi-bin/cvsweb.cgi/LVM2/verity?cvsroot=lvm2
-
-veritysetup -a vroot /dev/sda1 /dev/sda2 \
-       4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 695bd1e..8eb7152 100644 (file)
@@ -44,6 +44,7 @@ parameter is applicable:
        AVR32   AVR32 architecture is enabled.
        AX25    Appropriate AX.25 support is enabled.
        BLACKFIN Blackfin architecture is enabled.
+       DM      Device mapper support is enabled.
        DRM     Direct Rendering Management support is enabled.
        DYNAMIC_DEBUG Build in debug messages and enable them at runtime
        EDD     BIOS Enhanced Disk Drive Services (EDD) is enabled
@@ -702,6 +703,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        Disable PIN 1 of APIC timer
                        Can be useful to work around chipset bugs.
 
+       dm=             [DM] Allows early creation of a device-mapper device.
+                       See Documentation/device-mapper/boot.txt.
+
+       dmasound=       [HW,OSS] Sound subsystem buffers
+
        dma_debug=off   If the kernel is compiled with DMA_API_DEBUG support,
                        this option disables the debugging code at boot.
 
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 10f122a..2fcf2ae 100644 (file)
@@ -237,6 +237,53 @@ config DM_CRYPT
 
          If unsure, say N.
 
+config DM_BHT
+       tristate "Block hash tree support"
+       select CRYPTO
+       select CRYPTO_HASH
+       ---help---
+         Include support for device-mapper devices to use a block hash
+         tree for managing data integrity checks in a scalable way.
+
+         Targets that use this functionality should include it
+         automatically.
+
+         If unsure, say N.
+
+config DM_VERITY
+       tristate "Verity target support"
+       depends on BLK_DEV_DM
+       select DM_BHT
+       select CRYPTO
+       select CRYPTO_HASH
+       ---help---
+         This device-mapper target allows you to create a device that
+         transparently integrity checks the data on it. You'll need to
+         activate the digests you're going to use in the cryptoapi
+         configuration.
+
+         Information on how to use dm-verity can be found on
+
+         <http://dev.chromium.org/chromium-os/chromiumos-design-docs/verified-boot>
+
+         To compile this code as a module, choose M here: the module will
+         be called dm-verity.
+
+         If unsure, say N.
+
+config DM_VERITY_CHROMEOS
+       bool "Support Chrome OS specific verity error behavior"
+       depends on DM_VERITY
+       ---help---
+         Enables Chrome OS platform-specific error behavior.  In particular,
+         it will modify the partition preceding the verified block device
+         when a non-transient error occurs (followed by a panic).
+
+         This module relies on linux/chromeos_platform.h and will behave
+         reasonably even if it only supplies the stubs.
+
+         If unsure, say N.
+
 config DM_SNAPSHOT
        tristate "Snapshot target"
        depends on BLK_DEV_DM
@@ -370,24 +417,4 @@ config DM_FLAKEY
        ---help---
          A target that intermittently fails I/O for debugging purposes.
 
-config DM_VERITY
-       tristate "Verity target support (EXPERIMENTAL)"
-       depends on BLK_DEV_DM && EXPERIMENTAL
-       select CRYPTO
-       select CRYPTO_HASH
-       select DM_BUFIO
-       ---help---
-         This device-mapper target creates a read-only device that
-         transparently validates the data on one underlying device against
-         a pre-generated tree of cryptographic checksums stored on a second
-         device.
-
-         You'll need to activate the digests you're going to use in the
-         cryptoapi configuration.
-
-         To compile this code as a module, choose M here: the module will
-         be called dm-verity.
-
-         If unsure, say N.
-
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 8b2e0df..4dd9712 100644 (file)
@@ -30,6 +30,9 @@ obj-$(CONFIG_BLK_DEV_MD)      += md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)       += dm-mod.o
 obj-$(CONFIG_DM_BUFIO)         += dm-bufio.o
 obj-$(CONFIG_DM_CRYPT)         += dm-crypt.o
+obj-$(CONFIG_DM_BHT)           += dm-bht.o
+obj-$(CONFIG_DM_VERITY)                += dm-verity.o
+obj-$(CONFIG_DM_VERITY_CHROMEOS)               += dm-verity-chromeos.o
 obj-$(CONFIG_DM_DELAY)         += dm-delay.o
 obj-$(CONFIG_DM_FLAKEY)                += dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)     += dm-multipath.o dm-round-robin.o
@@ -42,7 +45,6 @@ obj-$(CONFIG_DM_LOG_USERSPACE)        += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)          += dm-zero.o
 obj-$(CONFIG_DM_RAID)  += dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)     += dm-thin-pool.o
-obj-$(CONFIG_DM_VERITY)                += dm-verity.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs                    += dm-uevent.o
diff --git a/drivers/md/dm-bht.c b/drivers/md/dm-bht.c
new file mode 100644 (file)
index 0000000..539847f
--- /dev/null
@@ -0,0 +1,993 @@
+/*
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *
+ * Device-Mapper block hash tree interface.
+ * See Documentation/device-mapper/dm-bht.txt for details.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <asm/atomic.h>
+#include <asm/page.h>
+#include <linux/bitops.h>  /* for fls() */
+#include <linux/bug.h>
+#include <linux/cpumask.h>  /* nr_cpu_ids */
+/* #define CONFIG_DM_DEBUG 1 */
+#include <linux/device-mapper.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/dm-bht.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm_types.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>  /* k*alloc */
+#include <linux/string.h>  /* memset */
+
+#define DM_MSG_PREFIX "dm bht"
+
+/* For sector formatting. */
+#if defined(_LP64) || defined(__LP64__) || __BITS_PER_LONG == 64
+#define __PRIS_PREFIX "z"
+#else
+#define __PRIS_PREFIX "ll"
+#endif
+#define PRIu64 __PRIS_PREFIX "u"
+
+
+/*-----------------------------------------------
+ * Utilities
+ *-----------------------------------------------*/
+
+static u8 from_hex(u8 ch)
+{
+       if ((ch >= '0') && (ch <= '9'))
+               return ch - '0';
+       if ((ch >= 'a') && (ch <= 'f'))
+               return ch - 'a' + 10;
+       if ((ch >= 'A') && (ch <= 'F'))
+               return ch - 'A' + 10;
+       return -1;
+}
+
+/**
+ * dm_bht_bin_to_hex - converts a binary stream to human-readable hex
+ * @binary:    a byte array of length @binary_len
+ * @hex:       a byte array of length @binary_len * 2 + 1
+ */
+static void dm_bht_bin_to_hex(u8 *binary, u8 *hex, unsigned int binary_len)
+{
+       while (binary_len-- > 0) {
+               sprintf((char *__restrict__)hex, "%02hhx", (int)*binary);
+               hex += 2;
+               binary++;
+       }
+}
+
+/**
+ * dm_bht_hex_to_bin - converts a hex stream to binary
+ * @binary:    a byte array of length @binary_len
+ * @hex:       a byte array of length @binary_len * 2 + 1
+ */
+static void dm_bht_hex_to_bin(u8 *binary, const u8 *hex,
+                             unsigned int binary_len)
+{
+       while (binary_len-- > 0) {
+               *binary = from_hex(*(hex++));
+               *binary *= 16;
+               *binary += from_hex(*(hex++));
+               binary++;
+       }
+}
+
+static void dm_bht_log_mismatch(struct dm_bht *bht, u8 *given, u8 *computed)
+{
+       u8 given_hex[DM_BHT_MAX_DIGEST_SIZE * 2 + 1];
+       u8 computed_hex[DM_BHT_MAX_DIGEST_SIZE * 2 + 1];
+       dm_bht_bin_to_hex(given, given_hex, bht->digest_size);
+       dm_bht_bin_to_hex(computed, computed_hex, bht->digest_size);
+       DMERR_LIMIT("%s != %s", given_hex, computed_hex);
+}
+
+/* Used for turning verifiers into computers */
+typedef int (*dm_bht_compare_cb)(struct dm_bht *, u8 *, u8 *);
+
+/**
+ * dm_bht_compute_hash: hashes a page of data
+ */
+static int dm_bht_compute_hash(struct dm_bht *bht, struct page *pg,
+                              unsigned int offset, u8 *digest)
+{
+       struct hash_desc *hash_desc = &bht->hash_desc[smp_processor_id()];
+       struct scatterlist sg;
+
+       sg_init_table(&sg, 1);
+       sg_set_page(&sg, pg, PAGE_SIZE, offset);
+       /* Note, this is synchronous. */
+       if (crypto_hash_init(hash_desc)) {
+               DMCRIT("failed to reinitialize crypto hash (proc:%d)",
+                       smp_processor_id());
+               return -EINVAL;
+       }
+       if (crypto_hash_update(hash_desc, &sg, PAGE_SIZE)) {
+               DMCRIT("crypto_hash_update failed");
+               return -EINVAL;
+       }
+       if (bht->have_salt) {
+               sg_set_buf(&sg, bht->salt, sizeof(bht->salt));
+               if (crypto_hash_update(hash_desc, &sg, sizeof(bht->salt))) {
+                       DMCRIT("crypto_hash_update failed");
+                       return -EINVAL;
+               }
+       }
+       if (crypto_hash_final(hash_desc, digest)) {
+               DMCRIT("crypto_hash_final failed");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static __always_inline struct dm_bht_level *dm_bht_get_level(struct dm_bht *bht,
+                                                            int depth)
+{
+       return &bht->levels[depth];
+}
+
+static __always_inline unsigned int dm_bht_get_level_shift(struct dm_bht *bht,
+                                                          int depth)
+{
+       return (bht->depth - depth) * bht->node_count_shift;
+}
+
+/* For the given depth, this is the entry index.  At depth+1 it is the node
+ * index for depth.
+ */
+static __always_inline unsigned int dm_bht_index_at_level(struct dm_bht *bht,
+                                                         int depth,
+                                                         unsigned int leaf)
+{
+       return leaf >> dm_bht_get_level_shift(bht, depth);
+}
+
+static __always_inline u8 *dm_bht_node(struct dm_bht *bht,
+                                      struct dm_bht_entry *entry,
+                                      unsigned int node_index)
+{
+       return &entry->nodes[node_index * bht->digest_size];
+}
+
+static inline struct dm_bht_entry *dm_bht_get_entry(struct dm_bht *bht,
+                                                   int depth,
+                                                   unsigned int block)
+{
+       unsigned int index = dm_bht_index_at_level(bht, depth, block);
+       struct dm_bht_level *level = dm_bht_get_level(bht, depth);
+
+       BUG_ON(index >= level->count);
+
+       return &level->entries[index];
+}
+
+static inline u8 *dm_bht_get_node(struct dm_bht *bht,
+                                 struct dm_bht_entry *entry,
+                                 int depth,
+                                 unsigned int block)
+{
+       unsigned int index = dm_bht_index_at_level(bht, depth, block);
+
+       return dm_bht_node(bht, entry, index % bht->node_count);
+}
+
+
+/*-----------------------------------------------
+ * Implementation functions
+ *-----------------------------------------------*/
+
+static int dm_bht_initialize_entries(struct dm_bht *bht);
+
+static int dm_bht_read_callback_stub(void *ctx, sector_t start, u8 *dst,
+                                    sector_t count,
+                                    struct dm_bht_entry *entry);
+static int dm_bht_write_callback_stub(void *ctx, sector_t start,
+                                     u8 *dst, sector_t count,
+                                     struct dm_bht_entry *entry);
+
+/**
+ * dm_bht_create - prepares @bht for use
+ * @bht:       pointer to the dm_bht to initialize
+ * @block_count: the number of block hashes / tree leaves; the tree
+ *             depth is computed from this
+ * @alg_name:  crypto hash algorithm name
+ *
+ * Returns 0 on success.
+ *
+ * Callers can offset into devices by storing the data in the io callbacks.
+ * TODO(wad) bust up into smaller helpers
+ */
+int dm_bht_create(struct dm_bht *bht, unsigned int block_count,
+                 const char *alg_name)
+{
+       int status = 0;
+       int cpu = 0;
+
+       bht->have_salt = false;
+
+       /* Setup the hash first. Its length determines much of the bht layout */
+       for (cpu = 0; cpu < nr_cpu_ids; ++cpu) {
+               bht->hash_desc[cpu].tfm = crypto_alloc_hash(alg_name, 0, 0);
+               if (IS_ERR(bht->hash_desc[cpu].tfm)) {
+                       DMERR("failed to allocate crypto hash '%s'", alg_name);
+                       status = -ENOMEM;
+                       bht->hash_desc[cpu].tfm = NULL;
+                       goto bad_hash_alg;
+               }
+       }
+       bht->digest_size = crypto_hash_digestsize(bht->hash_desc[0].tfm);
+       /* We expect to be able to pack >=2 hashes into a page */
+       if (PAGE_SIZE / bht->digest_size < 2) {
+               DMERR("too few hashes fit in a page");
+               status = -EINVAL;
+               goto bad_digest_len;
+       }
+
+       if (bht->digest_size > DM_BHT_MAX_DIGEST_SIZE) {
+               DMERR("DM_BHT_MAX_DIGEST_SIZE too small for chosen digest");
+               status = -EINVAL;
+               goto bad_digest_len;
+       }
+
+       /* Configure the tree */
+       bht->block_count = block_count;
+       DMDEBUG("Setting block_count %u", block_count);
+       if (block_count == 0) {
+               DMERR("block_count must be non-zero");
+               status = -EINVAL;
+               goto bad_block_count;
+       }
+
+       /* Each dm_bht_entry->nodes is one page.  The node code tracks
+        * how many nodes fit into one entry where a node is a single
+        * hash (message digest).
+        */
+       bht->node_count_shift = fls(PAGE_SIZE / bht->digest_size) - 1;
+       /* Round down to the nearest power of two.  This makes indexing
+        * into the tree much less painful.
+        */
+       bht->node_count = 1 << bht->node_count_shift;
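+       /* For example, PAGE_SIZE 4096 with sha256 (32-byte digests):
+        * node_count_shift = fls(128) - 1 = 7, so node_count = 128.
+        */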
+
+       /* This is unlikely to happen, but with 64k pages, who knows. */
+       if (bht->node_count > UINT_MAX / bht->digest_size) {
+               DMERR("node_count * hash_len exceeds UINT_MAX!");
+               status = -EINVAL;
+               goto bad_node_count;
+       }
+
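+       /* For example, block_count = 32768 with node_count_shift = 7
+        * gives fls(32767) = 15 and depth = DIV_ROUND_UP(15, 7) = 3.
+        */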
+       bht->depth = DIV_ROUND_UP(fls(block_count - 1), bht->node_count_shift);
+       DMDEBUG("Setting depth to %d.", bht->depth);
+
+       /* Ensure that we can safely shift by this value. */
+       if (bht->depth * bht->node_count_shift >= sizeof(unsigned int) * 8) {
+               DMERR("specified depth and node_count_shift is too large");
+               status = -EINVAL;
+               goto bad_node_count;
+       }
+
+       /* Allocate levels. Each level of the tree may have an arbitrary number
+        * of dm_bht_entry structs.  Each entry contains node_count nodes.
+        * Each node in the tree is a cryptographic digest of either node_count
+        * nodes on the subsequent level or of a specific block on disk.
+        */
+       bht->levels = (struct dm_bht_level *)
+                       kcalloc(bht->depth,
+                               sizeof(struct dm_bht_level), GFP_KERNEL);
+       if (!bht->levels) {
+               DMERR("failed to allocate tree levels");
+               status = -ENOMEM;
+               goto bad_level_alloc;
+       }
+
+       /* Setup callback stubs */
+       bht->read_cb = &dm_bht_read_callback_stub;
+       bht->write_cb = &dm_bht_write_callback_stub;
+
+       status = dm_bht_initialize_entries(bht);
+       if (status)
+               goto bad_entries_alloc;
+
+       /* We compute depth such that there is only 1 block at level 0. */
+       BUG_ON(bht->levels[0].count != 1);
+
+       return 0;
+
+bad_entries_alloc:
+       while (bht->depth-- > 0)
+               kfree(bht->levels[bht->depth].entries);
+       kfree(bht->levels);
+bad_node_count:
+bad_level_alloc:
+bad_block_count:
+bad_digest_len:
+bad_hash_alg:
+       for (cpu = 0; cpu < nr_cpu_ids; ++cpu)
+               if (bht->hash_desc[cpu].tfm)
+                       crypto_free_hash(bht->hash_desc[cpu].tfm);
+       return status;
+}
+EXPORT_SYMBOL(dm_bht_create);
+
+static int dm_bht_initialize_entries(struct dm_bht *bht)
+{
+       /* The last_index represents the index into the last
+        * block digest that will be stored in the tree.  By walking the
+        * tree with that index, it is possible to compute the total number
+        * of entries needed at each level in the tree.
+        *
+        * Since each entry will contain up to |node_count| nodes of the tree,
+        * it is possible that the last index may not be at the end of a given
+        * entry->nodes.  In that case, it is assumed the value is padded.
+        *
+        * Note, we treat both the tree root (1 hash) and the tree leaves
+        * independently from the bht data structures.  Logically, the root is
+        * depth=-1 and the block layer level is depth=bht->depth
+        */
+       unsigned int last_index = ALIGN(bht->block_count, bht->node_count) - 1;
+       unsigned int total_entries = 0;
+       struct dm_bht_level *level = NULL;
+       int depth;
+
+       /* check that the largest level->count can't result in an int overflow
+        * on allocation or sector calculation.
+        */
+       if (((last_index >> bht->node_count_shift) + 1) >
+           UINT_MAX / max((unsigned int)sizeof(struct dm_bht_entry),
+                          (unsigned int)to_sector(PAGE_SIZE))) {
+               DMCRIT("required entries %u is too large",
+                      last_index + 1);
+               return -EINVAL;
+       }
+
+       /* Track the current sector location for each level so we don't have to
+        * compute it during traversals.
+        */
+       bht->sectors = 0;
+       for (depth = 0; depth < bht->depth; ++depth) {
+               level = dm_bht_get_level(bht, depth);
+               level->count = dm_bht_index_at_level(bht, depth,
+                                                    last_index) + 1;
+               DMDEBUG("depth: %d entries: %u", depth, level->count);
+               /* TODO(wad) consider the case where the data stored for each
+                * level is done with contiguous pages (instead of using
+                * entry->nodes) and the level just contains two bitmaps:
+                * (a) which pages have been loaded from disk
+                * (b) which specific nodes have been verified.
+                */
+               level->entries = (struct dm_bht_entry *)
+                                kcalloc(level->count,
+                                        sizeof(struct dm_bht_entry),
+                                        GFP_KERNEL);
+               if (!level->entries) {
+                       DMERR("failed to allocate entries for depth %d",
+                             bht->depth);
+                       /* let the caller clean up the mess */
+                       return -ENOMEM;
+               }
+               total_entries += level->count;
+               level->sector = bht->sectors;
+               /* number of sectors per entry * entries at this level */
+               bht->sectors += level->count * to_sector(PAGE_SIZE);
+               /* not ideal, but unsigned overflow is well-defined, so wrap is detectable */
+               if (bht->sectors < level->sector) {
+                       DMCRIT("level sector calculation overflowed");
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+static int dm_bht_read_callback_stub(void *ctx, sector_t start, u8 *dst,
+                                    sector_t count, struct dm_bht_entry *entry)
+{
+       DMCRIT("dm_bht_read_callback_stub called!");
+       dm_bht_read_completed(entry, -EIO);
+       return -EIO;
+}
+
+static int dm_bht_write_callback_stub(void *ctx, sector_t start,
+                                     u8 *dst, sector_t count,
+                                     struct dm_bht_entry *entry)
+{
+       DMCRIT("dm_bht_write_callback_stub called!");
+       dm_bht_write_completed(entry, -EIO);
+       return -EIO;
+}
+
+/**
+ * dm_bht_read_completed
+ * @entry:     pointer to the entry that's been loaded
+ * @status:    I/O status. Non-zero is failure.
+ * MUST always be called after a read_cb completes.
+ */
+void dm_bht_read_completed(struct dm_bht_entry *entry, int status)
+{
+       if (status) {
+               /* TODO(wad) add retry support */
+               DMCRIT("an I/O error occurred while reading entry");
+               atomic_set(&entry->state, DM_BHT_ENTRY_ERROR_IO);
+               /* entry->nodes will be freed later */
+               return;
+       }
+       BUG_ON(atomic_read(&entry->state) != DM_BHT_ENTRY_PENDING);
+       atomic_set(&entry->state, DM_BHT_ENTRY_READY);
+}
+EXPORT_SYMBOL(dm_bht_read_completed);
+
+/**
+ * dm_bht_write_completed
+ * @entry:     pointer to the entry that's been loaded
+ * @status:    I/O status. Non-zero is failure.
+ * Should be called after a write_cb completes. Currently only catches
+ * errors which most writers don't care about.
+ */
+void dm_bht_write_completed(struct dm_bht_entry *entry, int status)
+{
+       if (status) {
+               DMCRIT("an I/O error occurred while writing entry");
+               atomic_set(&entry->state, DM_BHT_ENTRY_ERROR_IO);
+               /* entry->nodes will be freed later */
+               return;
+       }
+}
+EXPORT_SYMBOL(dm_bht_write_completed);
+
+/* dm_bht_verify_path
+ * Verifies the path up to the root.  Returns 0 on success.
+ */
+static int dm_bht_verify_path(struct dm_bht *bht, unsigned int block,
+                             struct page *pg, unsigned int offset)
+{
+       int depth = bht->depth;
+       u8 digest[DM_BHT_MAX_DIGEST_SIZE];
+       struct dm_bht_entry *entry;
+       u8 *node;
+       int state;
+
+       do {
+               /* Need to check that the hash of the current block is accurate
+                * in its parent.
+                */
+               entry = dm_bht_get_entry(bht, depth - 1, block);
+               state = atomic_read(&entry->state);
+               /* This call is only safe if all nodes along the path
+                * are already populated (i.e. READY) via dm_bht_populate.
+                */
+               BUG_ON(state < DM_BHT_ENTRY_READY);
+               node = dm_bht_get_node(bht, entry, depth, block);
+
+               if (dm_bht_compute_hash(bht, pg, offset, digest) ||
+                   memcmp(digest, node, bht->digest_size))
+                       goto mismatch;
+
+               /* Keep the containing block of hashes to be verified in the
+                * next pass.
+                */
+               pg = virt_to_page(entry->nodes);
+               offset = 0;
+       } while (--depth > 0 && state != DM_BHT_ENTRY_VERIFIED);
+
+       if (depth == 0 && state != DM_BHT_ENTRY_VERIFIED) {
+               if (dm_bht_compute_hash(bht, pg, offset, digest) ||
+                   memcmp(digest, bht->root_digest, bht->digest_size))
+                       goto mismatch;
+               atomic_set(&entry->state, DM_BHT_ENTRY_VERIFIED);
+       }
+
+       /* Mark path to leaf as verified. */
+       for (depth++; depth < bht->depth; depth++) {
+               entry = dm_bht_get_entry(bht, depth, block);
+               /* At this point, entry can only be in VERIFIED or READY state.
+                * So it is safe to use atomic_set instead of atomic_cmpxchg.
+                */
+               atomic_set(&entry->state, DM_BHT_ENTRY_VERIFIED);
+       }
+
+       DMDEBUG("verify_path: node %u is verified to root", block);
+       return 0;
+
+mismatch:
+       DMERR_LIMIT("verify_path: failed to verify hash (d=%d,bi=%u)",
+                   depth, block);
+       dm_bht_log_mismatch(bht, node, digest);
+       return DM_BHT_ENTRY_ERROR_MISMATCH;
+}
+
+/**
+ * dm_bht_store_block - sets a given block's hash in the tree
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @block:     numeric index of the block in the tree
+ * @digest:    array of u8s containing the digest of length @bht->digest_size
+ *
+ * Returns 0 on success, >0 when data is pending, and <0 when an I/O or other
+ * error has occurred.
+ *
+ * If the containing entry in the tree is unallocated, it will allocate memory
+ * and mark the entry as ready.  All other block entries will be 0s.  This
+ * function is not safe for simultaneous use when verifying data and should not
+ * be used if the @bht is being accessed by any other functions in any other
+ * threads/processes.
+ *
+ * It is expected that virt_to_page will work on |block_data|.
+ */
+int dm_bht_store_block(struct dm_bht *bht, unsigned int block,
+                      u8 *block_data)
+{
+       int depth;
+       unsigned int index;
+       unsigned int node_index;
+       struct dm_bht_entry *entry;
+       struct dm_bht_level *level;
+       int state;
+       struct page *node_page = NULL;
+
+       /* Look at the last level of nodes above the leaves (data blocks) */
+       depth = bht->depth - 1;
+
+       /* Index into the level */
+       level = dm_bht_get_level(bht, depth);
+       index = dm_bht_index_at_level(bht, depth, block);
+       /* Grab the node index into the current entry by getting the
+        * index at the leaf-level.
+        */
+       node_index = dm_bht_index_at_level(bht, depth + 1, block) %
+                    bht->node_count;
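+       /* At the leaf level the index is the block number itself, so the
+        * modulo selects this block's slot within the entry's hash page.
+        */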
+       entry = &level->entries[index];
+
+       DMDEBUG("Storing block %u in d=%d,ei=%u,ni=%u,s=%d",
+               block, depth, index, node_index,
+               atomic_read(&entry->state));
+
+       state = atomic_cmpxchg(&entry->state,
+                              DM_BHT_ENTRY_UNALLOCATED,
+                              DM_BHT_ENTRY_PENDING);
+       /* !!! Note. It is up to the users of the update interface to
+        *     ensure the entry data is fully populated prior to use.
+        *     The number of updated entries is NOT tracked.
+        */
+       if (state == DM_BHT_ENTRY_UNALLOCATED) {
+               node_page = alloc_page(GFP_KERNEL);
+               if (!node_page) {
+                       atomic_set(&entry->state, DM_BHT_ENTRY_ERROR);
+                       return -ENOMEM;
+               }
+               entry->nodes = page_address(node_page);
+               memset(entry->nodes, 0, PAGE_SIZE);
+               /* TODO(wad) could expose this to the caller so that they
+                * can transition from unallocated to ready manually.
+                */
+               atomic_set(&entry->state, DM_BHT_ENTRY_READY);
+       } else if (state <= DM_BHT_ENTRY_ERROR) {
+               DMCRIT("leaf entry for block %u is invalid",
+                     block);
+               return state;
+       } else if (state == DM_BHT_ENTRY_PENDING) {
+               DMERR("leaf data is pending for block %u", block);
+               return 1;
+       }
+
+       dm_bht_compute_hash(bht, virt_to_page(block_data), 0,
+                           dm_bht_node(bht, entry, node_index));
+       return 0;
+}
+EXPORT_SYMBOL(dm_bht_store_block);
+
+/**
+ * dm_bht_zeroread_callback - read callback which always returns 0s
+ * @ctx:       ignored
+ * @start:     ignored
+ * @dst:       buffer to write 0s to
+ * @count:     number of sectors worth of data to write
+ * @entry:     entry to mark as read; dm_bht_read_completed() is called
+ *             on it before returning
+ *
+ * Always returns 0.
+ *
+ * Meant for use by dm_bht_compute() callers.  It allows dm_bht_populate()
+ * to be used to pre-fill a tree with zeroed out entry nodes.
+ */
+int dm_bht_zeroread_callback(void *ctx, sector_t start, u8 *dst,
+                            sector_t count, struct dm_bht_entry *entry)
+{
+       memset(dst, 0, to_bytes(count));
+       dm_bht_read_completed(entry, 0);
+       return 0;
+}
+EXPORT_SYMBOL(dm_bht_zeroread_callback);
+
+/**
+ * dm_bht_compute - computes and updates all non-block-level hashes in a tree
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @read_cb_ctx: opaque read_cb context for all I/O on this call
+ *
+ * Returns 0 on success, >0 when data is pending, and <0 when an I/O or other
+ * error has occurred.
+ *
+ * Walks the tree and computes the hashes at each level from the
+ * hashes below. This can only be called once per tree creation
+ * since it will mark entries verified. Expects dm_bht_populate() to
+ * correctly populate the tree from the read_callback_stub.
+ *
+ * This function should not be used when verifying the same tree and
+ * should not be used with multiple simultaneous operators on @bht.
+ */
+int dm_bht_compute(struct dm_bht *bht, void *read_cb_ctx)
+{
+       int depth, r = 0;
+
+       for (depth = bht->depth - 2; depth >= 0; depth--) {
+               struct dm_bht_level *level = dm_bht_get_level(bht, depth);
+               struct dm_bht_level *child_level = level + 1;
+               struct dm_bht_entry *entry = level->entries;
+               struct dm_bht_entry *child = child_level->entries;
+               unsigned int i, j;
+
+               for (i = 0; i < level->count; i++, entry++) {
+                       unsigned int count = bht->node_count;
+                       struct page *pg;
+
+                       pg = alloc_page(GFP_NOIO);
+                       if (!pg) {
+                               DMCRIT("failed to allocate page for entry");
+                               r = -ENOMEM;
+                               goto out;
+                       }
+
+                       entry->nodes = page_address(pg);
+                       memset(entry->nodes, 0, PAGE_SIZE);
+                       atomic_set(&entry->state, DM_BHT_ENTRY_READY);
+
+                       if (i == (level->count - 1))
+                               count = child_level->count % bht->node_count;
+                       if (count == 0)
+                               count = bht->node_count;
+                       for (j = 0; j < count; j++, child++) {
+                               struct page *pg = virt_to_page(child->nodes);
+                               u8 *digest = dm_bht_node(bht, entry, j);
+
+                               r = dm_bht_compute_hash(bht, pg, 0, digest);
+                               if (r) {
+                                       DMERR("Failed to update (d=%d,i=%u)",
+                                             depth, i);
+                                       goto out;
+                               }
+                       }
+               }
+       }
+       r = dm_bht_compute_hash(bht,
+                               virt_to_page(bht->levels[0].entries->nodes),
+                               0, bht->root_digest);
+       if (r)
+               DMERR("Failed to update root hash");
+
+out:
+       return r;
+}
+EXPORT_SYMBOL(dm_bht_compute);
+
+/**
+ * dm_bht_sync - writes the tree in memory to disk
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @write_ctx: callback context for writes issued
+ *
+ * Since all entry nodes are PAGE_SIZE, the data will be pre-aligned and
+ * padded.
+ */
+int dm_bht_sync(struct dm_bht *bht, void *write_cb_ctx)
+{
+       int depth;
+       int ret = 0;
+       int state;
+       sector_t sector;
+       struct dm_bht_level *level;
+       struct dm_bht_entry *entry;
+       struct dm_bht_entry *entry_end;
+
+       for (depth = 0; depth < bht->depth; ++depth) {
+               level = dm_bht_get_level(bht, depth);
+               entry_end = level->entries + level->count;
+               sector = level->sector;
+               for (entry = level->entries; entry < entry_end; ++entry) {
+                       state = atomic_read(&entry->state);
+                       if (state <= DM_BHT_ENTRY_PENDING) {
+                               DMERR("At depth %d, entry %lu is not ready",
+                                     depth,
+                                     (unsigned long)(entry - level->entries));
+                               return state;
+                       }
+                       ret = bht->write_cb(write_cb_ctx,
+                                           sector,
+                                           entry->nodes,
+                                           to_sector(PAGE_SIZE),
+                                           entry);
+                       if (ret) {
+                               DMCRIT("an error occurred writing entry %lu",
+                                     (unsigned long)(entry - level->entries));
+                               return ret;
+                       }
+                       sector += to_sector(PAGE_SIZE);
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(dm_bht_sync);
+
+/**
+ * dm_bht_is_populated - check that entries from disk needed to verify a given
+ *                       block are all ready
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @block:     specific block data is expected from
+ *
+ * Callers may wish to call dm_bht_is_populated() when checking an io
+ * for which entries were already pending.
+ */
+bool dm_bht_is_populated(struct dm_bht *bht, unsigned int block)
+{
+       int depth;
+
+       for (depth = bht->depth - 1; depth >= 0; depth--) {
+               struct dm_bht_entry *entry = dm_bht_get_entry(bht, depth,
+                                                             block);
+               if (atomic_read(&entry->state) < DM_BHT_ENTRY_READY)
+                       return false;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL(dm_bht_is_populated);
+
+/**
+ * dm_bht_populate - reads entries from disk needed to verify a given block
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @ctx:        context used for all read_cb calls on this request
+ * @block:     specific block data is expected from
+ *
+ * Returns negative value on error. Returns 0 on success.
+ */
+int dm_bht_populate(struct dm_bht *bht, void *ctx,
+                   unsigned int block)
+{
+       int depth;
+       int state = 0;
+
+       BUG_ON(block >= bht->block_count);
+
+       DMDEBUG("dm_bht_populate(%u)", block);
+
+       for (depth = bht->depth - 1; depth >= 0; --depth) {
+               struct dm_bht_level *level;
+               struct dm_bht_entry *entry;
+               unsigned int index;
+               struct page *pg;
+
+               entry = dm_bht_get_entry(bht, depth, block);
+               state = atomic_cmpxchg(&entry->state,
+                                      DM_BHT_ENTRY_UNALLOCATED,
+                                      DM_BHT_ENTRY_PENDING);
+
+               if (state == DM_BHT_ENTRY_VERIFIED)
+                       break;
+               if (state <= DM_BHT_ENTRY_ERROR)
+                       goto error_state;
+               if (state != DM_BHT_ENTRY_UNALLOCATED)
+                       continue;
+
+               /* Current entry is claimed for allocation and loading */
+               pg = alloc_page(GFP_NOIO);
+               if (!pg)
+                       goto nomem;
+
+               /* dm-bht guarantees page-aligned memory for callbacks. */
+               entry->nodes = page_address(pg);
+
+               /* TODO(wad) error check callback here too */
+
+               level = &bht->levels[depth];
+               index = dm_bht_index_at_level(bht, depth, block);
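+               /* Each entry occupies one page on disk, so this entry's
+                * data starts index pages past the level's first sector.
+                */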
+               bht->read_cb(ctx, level->sector + to_sector(index * PAGE_SIZE),
+                            entry->nodes, to_sector(PAGE_SIZE), entry);
+       }
+
+       return 0;
+
+error_state:
+       DMCRIT("block %u at depth %d is in an error state", block, depth);
+       return state;
+
+nomem:
+       DMCRIT("failed to allocate memory for entry->nodes");
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(dm_bht_populate);
+
+
+/**
+ * dm_bht_verify_block - checks that all nodes in the path for @block are valid
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @block:     specific block data is expected from
+ * @pg:                page holding the block data
+ * @offset:    offset into the page
+ *
+ * Returns 0 on success, 1 on missing data, and a negative error
+ * code on verification failure. All supporting functions called
+ * should return similarly.
+ */
+int dm_bht_verify_block(struct dm_bht *bht, unsigned int block,
+                       struct page *pg, unsigned int offset)
+{
+       BUG_ON(offset != 0);
+
+       return dm_bht_verify_path(bht, block, pg, offset);
+}
+EXPORT_SYMBOL(dm_bht_verify_block);
+
+/**
+ * dm_bht_destroy - cleans up all memory used by @bht
+ * @bht:       pointer to a dm_bht_create()d bht
+ *
+ * Returns 0 on success. Does not free @bht itself.
+ */
+int dm_bht_destroy(struct dm_bht *bht)
+{
+       int depth;
+       int cpu = 0;
+
+       depth = bht->depth;
+       while (depth-- != 0) {
+               struct dm_bht_entry *entry = bht->levels[depth].entries;
+               struct dm_bht_entry *entry_end = entry +
+                                                bht->levels[depth].count;
+               int state = 0;
+               for (; entry < entry_end; ++entry) {
+                       state = atomic_read(&entry->state);
+                       switch (state) {
+                       /* At present, no other states free memory,
+                        * but that will change.
+                        */
+                       case DM_BHT_ENTRY_UNALLOCATED:
+                               /* Allocated with improper state */
+                               BUG_ON(entry->nodes);
+                               continue;
+                       default:
+                               BUG_ON(!entry->nodes);
+                               __free_page(virt_to_page(entry->nodes));
+                               break;
+                       }
+               }
+               kfree(bht->levels[depth].entries);
+               bht->levels[depth].entries = NULL;
+       }
+       kfree(bht->levels);
+       for (cpu = 0; cpu < nr_cpu_ids; ++cpu)
+               if (bht->hash_desc[cpu].tfm)
+                       crypto_free_hash(bht->hash_desc[cpu].tfm);
+       return 0;
+}
+EXPORT_SYMBOL(dm_bht_destroy);
+
+/*-----------------------------------------------
+ * Accessors
+ *-----------------------------------------------*/
+
+/**
+ * dm_bht_sectors - return the sectors required on disk
+ * @bht:       pointer to a dm_bht_create()d bht
+ */
+sector_t dm_bht_sectors(const struct dm_bht *bht)
+{
+       return bht->sectors;
+}
+EXPORT_SYMBOL(dm_bht_sectors);
+
+/**
+ * dm_bht_set_read_cb - set read callback
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @read_cb:   callback function used for all read requests by @bht
+ */
+void dm_bht_set_read_cb(struct dm_bht *bht, dm_bht_callback read_cb)
+{
+       bht->read_cb = read_cb;
+}
+EXPORT_SYMBOL(dm_bht_set_read_cb);
+
+/**
+ * dm_bht_set_write_cb - set write callback
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @write_cb:  callback function used for all write requests by @bht
+ */
+void dm_bht_set_write_cb(struct dm_bht *bht, dm_bht_callback write_cb)
+{
+       bht->write_cb = write_cb;
+}
+EXPORT_SYMBOL(dm_bht_set_write_cb);
+
+/**
+ * dm_bht_set_root_hexdigest - sets an unverified root digest hash from hex
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @hexdigest: NUL-terminated array of u8s containing the new digest in hex
+ * Returns non-zero on error.
+ */
+int dm_bht_set_root_hexdigest(struct dm_bht *bht, const u8 *hexdigest)
+{
+       /* Make sure we have at least the bytes expected */
+       if (strnlen((char *)hexdigest, bht->digest_size * 2) !=
+           bht->digest_size * 2) {
+               DMERR("root digest length does not match hash algorithm");
+               return -1;
+       }
+       dm_bht_hex_to_bin(bht->root_digest, hexdigest, bht->digest_size);
+#ifdef CONFIG_DM_DEBUG
+       DMINFO("Set root digest to %s. Parsed as -> ", hexdigest);
+       dm_bht_log_mismatch(bht, bht->root_digest, bht->root_digest);
+#endif
+       return 0;
+}
+EXPORT_SYMBOL(dm_bht_set_root_hexdigest);
+
+/**
+ * dm_bht_root_hexdigest - returns root digest in hex
+ * @bht:       pointer to a dm_bht_create()d bht
+ * @hexdigest: u8 array of size @available
+ * @available: must be bht->digest_size * 2 + 1
+ */
+int dm_bht_root_hexdigest(struct dm_bht *bht, u8 *hexdigest, int available)
+{
+       if (available < 0 ||
+           ((unsigned int) available) < bht->digest_size * 2 + 1) {
+               DMERR("hexdigest has too few bytes available");
+               return -EINVAL;
+       }
+       dm_bht_bin_to_hex(bht->root_digest, hexdigest, bht->digest_size);
+       return 0;
+}
+EXPORT_SYMBOL(dm_bht_root_hexdigest);
+
+/**
+ * dm_bht_set_salt - sets the salt used, in hex
+ * @bht:      pointer to a dm_bht_create()d bht
+ * @hexsalt:  salt string, as hex; will be zero-padded or truncated to
+ *            DM_BHT_SALT_SIZE * 2 hex digits.
+ */
+void dm_bht_set_salt(struct dm_bht *bht, const char *hexsalt)
+{
+       size_t saltlen = min(strlen(hexsalt) / 2, sizeof(bht->salt));
+       bht->have_salt = true;
+       memset(bht->salt, 0, sizeof(bht->salt));
+       dm_bht_hex_to_bin(bht->salt, (const u8 *)hexsalt, saltlen);
+}
+
+/**
+ * dm_bht_salt - returns the salt used, in hex
+ * @bht:      pointer to a dm_bht_create()d bht
+ * @hexsalt:  buffer to put salt into, of length DM_BHT_SALT_SIZE * 2 + 1.
+ */
+int dm_bht_salt(struct dm_bht *bht, char *hexsalt)
+{
+       if (!bht->have_salt)
+               return -EINVAL;
+       dm_bht_bin_to_hex(bht->salt, (u8 *)hexsalt, sizeof(bht->salt));
+       return 0;
+}
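
A minimal usage sketch for the dm-bht accessors above (creation and
population calls are elided; my_read_cb, root_hex, salt_hex, block and
page are all hypothetical names, and my_read_cb is assumed to have the
same signature as kverityd_bht_read_callback shown later in this patch):

    struct dm_bht bht;

    /* ... bht is dm_bht_create()d elsewhere ... */
    dm_bht_set_read_cb(&bht, my_read_cb);
    if (dm_bht_set_root_hexdigest(&bht, root_hex))
        return -EINVAL;             /* bad or short hex digest */
    dm_bht_set_salt(&bht, salt_hex);    /* optional */
    /* ... dm_bht_populate() the needed entries, then: */
    switch (dm_bht_verify_block(&bht, block, page, 0)) {
    case 0:     /* verified */
        break;
    case 1:     /* hash data still pending; requeue the request */
        break;
    default:    /* negative: verification failure */
        break;
    }
    dm_bht_destroy(&bht);
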
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a1a3e6d..dcf91c4 100644 (file)
@@ -1749,6 +1749,46 @@ void dm_interface_exit(void)
        dm_hash_exit();
 }
 
+
+/**
+ * dm_ioctl_export - Permanently export a mapped device via the ioctl interface
+ * @md: Pointer to mapped_device
+ * @name: Buffer (size DM_NAME_LEN) for name
+ * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+                   const char *uuid)
+{
+       int r = 0;
+       struct hash_cell *hc;
+
+       if (!md) {
+               r = -ENXIO;
+               goto out;
+       }
+
+       /* The name and uuid can only be set once. */
+       mutex_lock(&dm_hash_cells_mutex);
+       hc = dm_get_mdptr(md);
+       mutex_unlock(&dm_hash_cells_mutex);
+       if (hc) {
+               DMERR("%s: already exported", dm_device_name(md));
+               r = -ENXIO;
+               goto out;
+       }
+
+       r = dm_hash_insert(name, uuid, md);
+       if (r) {
+               DMERR("%s: could not bind to '%s'", dm_device_name(md), name);
+               goto out;
+       }
+
+       /* Let udev know we've changed. */
+       dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md));
+out:
+       return r;
+}
+
 /**
  * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers
  * @md: Pointer to mapped_device
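
A sketch of how a boot-path caller (e.g. the new init/do_mounts_dm.c) might
use dm_ioctl_export once a device has been created and its table loaded;
the "vroot" name and the NULL uuid here are illustrative only:

    struct mapped_device *md;

    /* ... md created, table loaded and resumed ... */
    if (dm_ioctl_export(md, "vroot", NULL)) {
        DMERR("could not export boot device");
        goto fail;
    }
    /* The name/uuid are now fixed and udev has been notified. */
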
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d54ca6c..d1752fe 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/namei.h>
+#include <linux/mount.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/slab.h>
@@ -1427,6 +1428,30 @@ static bool dm_table_is_nonrot(struct dm_table *t)
        return 1;
 }
 
+static int device_nonrot(struct dm_target *ti, struct dm_dev *dev,
+                              sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+
+       return q && blk_queue_nonrot(q);
+}
+
+static bool dm_table_all_nonrot(struct dm_table *t)
+{
+       unsigned i = 0;
+
+       /* Ensure that all underlying devices are non-rotational. */
+       while (i < dm_table_get_num_targets(t)) {
+               struct dm_target *ti = dm_table_get_target(t, i++);
+
+               if (!ti->type->iterate_devices ||
+                   !ti->type->iterate_devices(ti, device_nonrot, NULL))
+                       return false;
+       }
+
+       return true;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
                               struct queue_limits *limits)
 {
@@ -1441,6 +1466,10 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
        else
                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+       if (!dm_table_all_nonrot(t))
+               queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
+       else
+               queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
 
        if (dm_table_supports_flush(t, REQ_FLUSH)) {
                flush |= REQ_FLUSH;
diff --git a/drivers/md/dm-verity-chromeos.c b/drivers/md/dm-verity-chromeos.c
new file mode 100644 (file)
index 0000000..dceca8f
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *                    All Rights Reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Implements a Chrome OS platform specific error handler.
+ */
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/chromeos_platform.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/err.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <asm/page.h>
+
+#include "dm-verity.h"
+
+#define DM_MSG_PREFIX "verity-chromeos"
+
+static void chromeos_invalidate_kernel_endio(struct bio *bio, int err)
+{
+       const char *mode = ((bio->bi_rw & REQ_WRITE) ? "write" : "read");
+       if (err)
+               chromeos_set_need_recovery();
+
+       if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+               DMERR("invalidate_kernel: %s not supported", mode);
+               chromeos_set_need_recovery();
+       } else if (!bio_flagged(bio, BIO_UPTODATE)) {
+               DMERR("invalidate_kernel: %s not up to date", mode);
+               chromeos_set_need_recovery();
+       } else {
+               DMERR("invalidate_kernel: partition header %s completed", mode);
+       }
+
+       complete(bio->bi_private);
+}
+
+static int chromeos_invalidate_kernel_submit(struct bio *bio,
+                                            struct block_device *bdev,
+                                            int rw, struct page *page)
+{
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       bio->bi_private = &wait;
+       bio->bi_end_io = chromeos_invalidate_kernel_endio;
+       bio->bi_bdev = bdev;
+
+       bio->bi_sector = 0;
+       bio->bi_vcnt = 1;
+       bio->bi_idx = 0;
+       bio->bi_size = 512;
+       bio->bi_rw = rw;
+       bio->bi_io_vec[0].bv_page = page;
+       bio->bi_io_vec[0].bv_len = 512;
+       bio->bi_io_vec[0].bv_offset = 0;
+
+       submit_bio(rw, bio);
+       /* Wait up to 2 seconds for completion or fail. */
+       if (!wait_for_completion_timeout(&wait, msecs_to_jiffies(2000)))
+               return -1;
+       return 0;
+}
+
+/* Replaces the first 8 bytes of a partition with DMVERROR */
+static int chromeos_invalidate_kernel(struct block_device *root_bdev)
+{
+       int ret = 0;
+       struct block_device *bdev;
+       struct bio *bio;
+       struct page *page;
+       int partno = root_bdev->bd_part->partno - 1;
+       dev_t kdev = MKDEV(0, 0);
+       fmode_t dev_mode;
+       /* Ensure we do synchronous unblocked I/O. We may also need
+        * sync_bdev() on completion, but it really shouldn't be needed.
+        */
+       int rw = REQ_SYNC | REQ_SOFTBARRIER | REQ_NOIDLE;
+
+       /* Very basic sanity checking. This should be better. */
+       if (!root_bdev || !root_bdev->bd_part ||
+           root_bdev->bd_part->partno <= 1) {
+               DMERR("invalidate_kernel: partition layout unexpected");
+               return -EINVAL;
+       }
+       kdev = MKDEV(MAJOR(root_bdev->bd_dev), MINOR(root_bdev->bd_dev) - 1);
+
+       DMERR("Attempting to invalidate kernel (part:%d,devt:%d)",
+             partno, kdev);
+
+       /* First we open the device for reading. */
+       dev_mode = FMODE_READ | FMODE_EXCL;
+       bdev = blkdev_get_by_dev(kdev, dev_mode, chromeos_invalidate_kernel);
+       if (IS_ERR(bdev)) {
+               DMERR("invalidate_kernel: could not open device for reading");
+               ret = -1;
+               goto failed_to_read;
+       }
+
+       bio = bio_alloc(GFP_NOIO, 1);
+       if (!bio) {
+               ret = -1;
+               goto failed_bio_alloc;
+       }
+
+       page = alloc_page(GFP_NOIO);
+       if (!page) {
+               ret = -ENOMEM;
+               goto failed_to_alloc_page;
+       }
+
+       if (chromeos_invalidate_kernel_submit(bio, bdev, rw, page)) {
+               ret = -1;
+               goto failed_to_submit_read;
+       }
+
+       /* We have a page. Let's make sure it looks right. */
+       if (memcmp("CHROMEOS", page_address(page), 8)) {
+               DMERR("invalidate_kernel called on non-kernel partition");
+               ret = -EINVAL;
+               goto invalid_header;
+       } else {
+               DMERR("invalidate_kernel: found CHROMEOS kernel partition");
+       }
+
+       /* Stamp it and rewrite */
+       memcpy(page_address(page), "DMVERROR", 8);
+
+       /* The block dev was opened read-only; reopen it for the write. */
+       blkdev_put(bdev, dev_mode);
+       dev_mode = FMODE_WRITE | FMODE_EXCL;
+       bdev = blkdev_get_by_dev(kdev, dev_mode, chromeos_invalidate_kernel);
+       if (IS_ERR(bdev)) {
+               DMERR("invalidate_kernel: could not open device for reading");
+               dev_mode = 0;
+               ret = -1;
+               goto failed_to_write;
+       }
+
+       rw |= REQ_WRITE;
+       if (chromeos_invalidate_kernel_submit(bio, bdev, rw, page)) {
+               ret = -1;
+               goto failed_to_submit_write;
+       }
+
+       DMERR("invalidate_kernel: completed.");
+       ret = 0;
+failed_to_submit_write:
+failed_to_write:
+invalid_header:
+       __free_page(page);
+failed_to_submit_read:
+       /* Technically, we'll leak a page with the pending bio, but
+        * we're about to panic, so it's safer to do the panic() we expect.
+        */
+failed_to_alloc_page:
+       bio_put(bio);
+failed_bio_alloc:
+       if (dev_mode)
+               blkdev_put(bdev, dev_mode);
+failed_to_read:
+       return ret;
+}
+
+static int error_handler(struct notifier_block *nb, unsigned long transient,
+                        void *opaque_err)
+{
+       struct dm_verity_error_state *err =
+               (struct dm_verity_error_state *) opaque_err;
+       err->behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+       if (transient)
+               return 0;
+
+       /* TODO(wad) Implement phase 2:
+        * - Attempt to read the dev_status_offset from the hash dev.
+        * - If the status offset is 0, replace the first byte of the sector
+        *   with 01 and panic().
+        * - If the status offset is not 0, invalidate the associated kernel
+        *   partition, then reboot.
+        * - make user space tools clear the last sector
+        */
+       if (chromeos_invalidate_kernel(err->dev))
+               chromeos_set_need_recovery();
+       return 0;
+}
+
+static struct notifier_block chromeos_nb = {
+       .notifier_call = &error_handler,
+       .next = NULL,
+       .priority = 1,
+};
+
+static int __init dm_verity_chromeos_init(void)
+{
+       int r;
+
+       r = dm_verity_register_error_notifier(&chromeos_nb);
+       if (r < 0)
+               DMERR("failed to register handler: %d", r);
+       else
+               DMINFO("dm-verity-chromeos registered");
+       return r;
+}
+
+static void __exit dm_verity_chromeos_exit(void)
+{
+       dm_verity_unregister_error_notifier(&chromeos_nb);
+}
+
+module_init(dm_verity_chromeos_init);
+module_exit(dm_verity_chromeos_exit);
+
+MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
+MODULE_DESCRIPTION("chromeos-specific error handler for dm-verity");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index fa365d3..5939df8 100644 (file)
 /*
- * Copyright (C) 2012 Red Hat, Inc.
+ * Originally based on dm-crypt.c,
+ * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
+ * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *                    All Rights Reserved.
  *
- * Author: Mikulas Patocka <mpatocka@redhat.com>
+ * This file is released under the GPL.
  *
- * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
- *
- * This file is released under the GPLv2.
- *
- * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
- * default prefetch value. Data are read in "prefetch_cluster" chunks from the
- * hash device. Setting this greatly improves performance when data and hash
- * are on the same disk on different partitions on devices with poor random
- * access behavior.
+ * Implements a verifying transparent block device.
+ * See Documentation/device-mapper/dm-verity.txt
  */
-
-#include "dm-bufio.h"
-
+#include <linux/async.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/genhd.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
 #include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <asm/atomic.h>
+#include <asm/page.h>
+
+/* #define CONFIG_DM_DEBUG 1 */
+#define CONFIG_DM_VERITY_TRACE 1
 #include <linux/device-mapper.h>
-#include <crypto/hash.h>
+#include <linux/dm-bht.h>
 
-#define DM_MSG_PREFIX                  "verity"
+#include "dm-verity.h"
+#include "md.h"
 
-#define DM_VERITY_IO_VEC_INLINE                16
-#define DM_VERITY_MEMPOOL_SIZE         4
-#define DM_VERITY_DEFAULT_PREFETCH_SIZE        262144
+#define DM_MSG_PREFIX "verity"
 
-#define DM_VERITY_MAX_LEVELS           63
+/* Supports up to 512-bit digests */
+#define VERITY_MAX_DIGEST_SIZE 64
 
-static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
+/* TODO(wad) make both of these report the error line/file to a
+ *           verity_bug function.
+ */
+#define VERITY_BUG(msg...) BUG()
+#define VERITY_BUG_ON(cond, msg...) BUG_ON(cond)
+
+/* Helper for printing sector_t */
+#define ULL(x) ((unsigned long long)(x))
+
+/* MIN_IOS is the minimum number of dm_verity_ios in the pool, but we also
+ * use it to preallocate biosets (MIN_IOS * 2):
+ * 1. We need to clone the entire bioset, including bio_vecs, before passing
+ *    them to the underlying block layer since it may alter the values.
+ * 2. We need to pad out biosets that are not block aligned.
+ * 3. We need to be able to create biosets while loading in hashes.
+ * This will need more tweaking for specific workload expectations.
+ */
+#define MIN_IOS 32
+/* During io_bht_read, we will spawn _many_ bios for a single I/O early on, but
+ * once the tree is populated, we will only need MIN_IOS at most to be able to
+ * pad out the request. We will also need space for the padding biovecs which
+ * is at most 2, less than one page per side.
+ */
+#define MIN_BIOS (MIN_IOS * 2)
 
-module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
+/* MUST be true: SECTOR_SHIFT <= VERITY_BLOCK_SHIFT <= PAGE_SHIFT */
+#define VERITY_BLOCK_SIZE 4096
+#define VERITY_BLOCK_SHIFT 12
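
The "MUST be true" invariant above could also be enforced at compile time;
a sketch (BUILD_BUG_ON must sit inside a function, e.g. the module init):

    BUILD_BUG_ON(VERITY_BLOCK_SHIFT < SECTOR_SHIFT);
    BUILD_BUG_ON(VERITY_BLOCK_SHIFT > PAGE_SHIFT);
    BUILD_BUG_ON(VERITY_BLOCK_SIZE != (1 << VERITY_BLOCK_SHIFT));
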
 
-struct dm_verity {
-       struct dm_dev *data_dev;
-       struct dm_dev *hash_dev;
-       struct dm_target *ti;
-       struct dm_bufio_client *bufio;
-       char *alg_name;
-       struct crypto_shash *tfm;
-       u8 *root_digest;        /* digest of the root block */
-       u8 *salt;               /* salt: its size is salt_size */
-       unsigned salt_size;
-       sector_t data_start;    /* data offset in 512-byte sectors */
-       sector_t hash_start;    /* hash start in blocks */
-       sector_t data_blocks;   /* the number of data blocks */
-       sector_t hash_blocks;   /* the number of hash blocks */
-       unsigned char data_dev_block_bits;      /* log2(data blocksize) */
-       unsigned char hash_dev_block_bits;      /* log2(hash blocksize) */
-       unsigned char hash_per_block_bits;      /* log2(hashes in hash block) */
-       unsigned char levels;   /* the number of tree levels */
-       unsigned char version;
-       unsigned digest_size;   /* digest size for the current hash algorithm */
-       unsigned shash_descsize;/* the size of temporary space for crypto */
-       int hash_failed;        /* set to 1 if hash of any block failed */
-
-       mempool_t *io_mempool;  /* mempool of struct dm_verity_io */
-       mempool_t *vec_mempool; /* mempool of bio vector */
-
-       struct workqueue_struct *verify_wq;
-
-       /* starting blocks for each tree level. 0 is the lowest level. */
-       sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
+/* Support additional tracing of requests */
+#ifdef CONFIG_DM_VERITY_TRACE
+#define VERITY_TRACE(param, fmt, args...) { \
+       if (param) \
+               DMINFO(fmt, ## args); \
+}
+static bool request_trace;
+module_param(request_trace, bool, 0644);
+MODULE_PARM_DESC(request_trace, "Enable request tracing to DMINFO");
+
+static bool alloc_trace;
+module_param(alloc_trace, bool, 0644);
+MODULE_PARM_DESC(alloc_trace, "Enable allocation tracing to DMINFO");
+#else
+#define VERITY_TRACE(...)
+#endif
+
+#define REQTRACE(fmt, args...) VERITY_TRACE(request_trace, "req: " fmt, ## args)
+#define ALLOCTRACE(fmt, args...) \
+       VERITY_TRACE(alloc_trace, "alloc: " fmt, ## args)
+
+/* Provide a lightweight means of specifying the global default for
+ * error behavior: eio, reboot, or none
+ * Legacy support for 0 = eio, 1 = reboot/panic, 2 = none, 3 = notify.
+ * This is matched to the enum in dm-verity.h.
+ */
+static const char *allowed_error_behaviors[] = { "eio", "panic", "none",
+                                                "notify", NULL };
+static char *error_behavior = "eio";
+module_param(error_behavior, charp, 0644);
+MODULE_PARM_DESC(error_behavior, "Behavior on error "
+                                "(eio, panic, none, notify)");
+
+/* Controls whether verity_get_device will wait forever for a device. */
+static bool dev_wait;
+module_param(dev_wait, bool, 0444);
+MODULE_PARM_DESC(dev_wait, "Wait forever for a backing device");
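
With dm-verity built in, these parameters can be set from the kernel
command line or (for the 0644 ones) changed at runtime through sysfs;
hypothetical examples:

    dm_verity.error_behavior=notify dm_verity.dev_wait=1

    echo panic > /sys/module/dm_verity/parameters/error_behavior
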
+
+/* Used for tracking pending bios as well as for exporting information via
+ * STATUSTYPE_INFO.
+ */
+struct verity_stats {
+       unsigned int io_queue;          /* # pending I/O operations */
+       unsigned int verify_queue;      /* # pending verify operations */
+       unsigned int average_requeues;  /* not implemented */
+
+       /*
+        * Number of times a data block was ready but we didn't have the
+        * hash blocks for it yet.
+        */
+       unsigned long long total_requeues;
+       unsigned long long total_requests;      /* number of reads */
+
+       unsigned long long total_blocks;        /* total blocks read */
+       unsigned long long total_size;  /* total size read, in bytes */
+
+       unsigned long bht_requests;     /* number of hash blocks read */
+
+       /* number of reads for each block size (log2) */
+       unsigned long io_by_block_size[sizeof(uint64_t) * 8];
+       unsigned long long io_size_by_block_size[sizeof(uint64_t) * 8];
+};
+
+/* per-requested-bio private data */
+enum verity_io_flags {
+       VERITY_IOFLAGS_CLONED = 0x1,    /* original bio has been cloned */
 };
 
 struct dm_verity_io {
-       struct dm_verity *v;
+       struct dm_target *target;
        struct bio *bio;
+       struct delayed_work work;
+       unsigned int flags;
 
-       /* original values of bio->bi_end_io and bio->bi_private */
-       bio_end_io_t *orig_bi_end_io;
-       void *orig_bi_private;
+       int error;
+       atomic_t pending;
 
-       sector_t block;
-       unsigned n_blocks;
+       sector_t sector;  /* converted to target sector */
+       u64 block;  /* aligned block index */
+       u64 count;  /* aligned count in blocks */
+};
 
-       /* saved bio vector */
-       struct bio_vec *io_vec;
-       unsigned io_vec_size;
+struct verity_config {
+       struct dm_dev *dev;
+       sector_t start;
+       sector_t size;
 
-       struct work_struct work;
+       struct dm_dev *hash_dev;
+       sector_t hash_start;
 
-       /* A space for short vectors; longer vectors are allocated separately. */
-       struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
+       struct dm_bht bht;
 
-       /*
-        * Three variably-size fields follow this struct:
-        *
-        * u8 hash_desc[v->shash_descsize];
-        * u8 real_digest[v->digest_size];
-        * u8 want_digest[v->digest_size];
-        *
-        * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
+       /* Pool required for io contexts */
+       mempool_t *io_pool;
+       /* Pool and bios required for making sure that backing device reads are
+        * in PAGE_SIZE increments.
         */
+       struct bio_set *bs;
+
+       char hash_alg[CRYPTO_MAX_ALG_NAME];
+
+       int error_behavior;
+
+       struct verity_stats stats;
+       const char *name;               /* name for this config */
+       struct dentry *debugfs_dir;     /* debugfs dir for this config */
 };
 
-static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
+static struct kmem_cache *_verity_io_pool;
+static struct workqueue_struct *kveritydq, *kverityd_ioq;
+static struct dentry *debugfs_root; /* top-level debugfs dir for verity */
+
+static void kverityd_verify(struct work_struct *work);
+static void kverityd_io(struct work_struct *work);
+static void kverityd_io_bht_populate(struct dm_verity_io *io);
+static void kverityd_io_bht_populate_end(struct bio *, int error);
+
+static BLOCKING_NOTIFIER_HEAD(verity_error_notifier);
+
+/*-----------------------------------------------
+ * Statistic tracking functions
+ *-----------------------------------------------*/
+
+void verity_stats_io_queue_inc(struct verity_config *vc)
 {
-       return (struct shash_desc *)(io + 1);
+       vc->stats.io_queue++;
 }
 
-static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
+void verity_stats_verify_queue_inc(struct verity_config *vc)
 {
-       return (u8 *)(io + 1) + v->shash_descsize;
+       vc->stats.verify_queue++;
 }
 
-static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
+void verity_stats_io_queue_dec(struct verity_config *vc)
 {
-       return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
+       vc->stats.io_queue--;
 }
 
-/*
- * Auxiliary structure appended to each dm-bufio buffer. If the value
- * hash_verified is nonzero, hash of the block has been verified.
- *
- * The variable hash_verified is set to 0 when allocating the buffer, then
- * it can be changed to 1 and it is never reset to 0 again.
- *
- * There is no lock around this value, a race condition can at worst cause
- * that multiple processes verify the hash of the same buffer simultaneously
- * and write 1 to hash_verified simultaneously.
- * This condition is harmless, so we don't need locking.
- */
-struct buffer_aux {
-       int hash_verified;
-};
+void verity_stats_verify_queue_dec(struct verity_config *vc)
+{
+       vc->stats.verify_queue--;
+}
 
-/*
- * Initialize struct buffer_aux for a freshly created buffer.
- */
-static void dm_bufio_alloc_callback(struct dm_buffer *buf)
+void verity_stats_total_requeues_inc(struct verity_config *vc)
 {
-       struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+       vc->stats.total_requeues++;
+}
 
-       aux->hash_verified = 0;
+void verity_stats_total_requests_inc(struct verity_config *vc)
+{
+       vc->stats.total_requests++;
 }
 
-/*
- * Translate input sector number to the sector number on the target device.
- */
-static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
+void verity_stats_average_requeues(struct verity_config *vc, int requeues)
 {
-       return v->data_start + dm_target_offset(v->ti, bi_sector);
+       /* TODO(wad) */
 }
 
-/*
- * Return hash position of a specified block at a specified tree level
- * (0 is the lowest level).
- * The lowest "hash_per_block_bits"-bits of the result denote hash position
- * inside a hash block. The remaining bits denote location of the hash block.
- */
-static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
-                                        int level)
+/*-----------------------------------------------
+ * Exported interfaces
+ *-----------------------------------------------*/
+
+int dm_verity_register_error_notifier(struct notifier_block *nb)
 {
-       return block >> (level * v->hash_per_block_bits);
+       return blocking_notifier_chain_register(&verity_error_notifier, nb);
 }
+EXPORT_SYMBOL_GPL(dm_verity_register_error_notifier);
 
-static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
-                                sector_t *hash_block, unsigned *offset)
+int dm_verity_unregister_error_notifier(struct notifier_block *nb)
 {
-       sector_t position = verity_position_at_level(v, block, level);
-       unsigned idx;
+       return blocking_notifier_chain_unregister(&verity_error_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(dm_verity_unregister_error_notifier);
 
-       *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
+/*-----------------------------------------------
+ * Allocation and utility functions
+ *-----------------------------------------------*/
 
-       if (!offset)
-               return;
+static void kverityd_src_io_read_end(struct bio *clone, int error);
 
-       idx = position & ((1 << v->hash_per_block_bits) - 1);
-       if (!v->version)
-               *offset = idx * v->digest_size;
-       else
-               *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
+/* Shared destructor for all internal bios */
+static void dm_verity_bio_destructor(struct bio *bio)
+{
+       struct dm_verity_io *io = bio->bi_private;
+       struct verity_config *vc = io->target->private;
+       bio_free(bio, vc->bs);
 }
 
-/*
- * Verify hash of a metadata block pertaining to the specified data block
- * ("block" argument) at a specified level ("level" argument).
- *
- * On successful return, io_want_digest(v, io) contains the hash value for
- * a lower tree level or for the data block (if we're at the lowest leve).
- *
- * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
- * If "skip_unverified" is false, unverified buffer is hashed and verified
- * against current value of io_want_digest(v, io).
- */
-static int verity_verify_level(struct dm_verity_io *io, sector_t block,
-                              int level, bool skip_unverified)
+struct bio *verity_alloc_bioset(struct verity_config *vc, gfp_t gfp_mask,
+                               int nr_iovecs)
 {
-       struct dm_verity *v = io->v;
-       struct dm_buffer *buf;
-       struct buffer_aux *aux;
-       u8 *data;
-       int r;
-       sector_t hash_block;
-       unsigned offset;
+       return bio_alloc_bioset(gfp_mask, nr_iovecs, vc->bs);
+}
 
-       verity_hash_at_level(v, block, level, &hash_block, &offset);
+static struct dm_verity_io *verity_io_alloc(struct dm_target *ti,
+                                           struct bio *bio, sector_t sector)
+{
+       struct verity_config *vc = ti->private;
+       struct dm_verity_io *io;
 
-       data = dm_bufio_read(v->bufio, hash_block, &buf);
-       if (unlikely(IS_ERR(data)))
-               return PTR_ERR(data);
+       ALLOCTRACE("dm_verity_io for sector %llu", ULL(sector));
+       io = mempool_alloc(vc->io_pool, GFP_NOIO);
+       if (unlikely(!io))
+               return NULL;
+       io->flags = 0;
+       io->target = ti;
+       io->bio = bio;
+       io->sector = sector;
+       io->error = 0;
 
-       aux = dm_bufio_get_aux_data(buf);
+       /* Convert the target-relative sector into an aligned block index */
+       io->block = to_bytes(sector) >> VERITY_BLOCK_SHIFT;
+       io->count = bio->bi_size >> VERITY_BLOCK_SHIFT;
 
-       if (!aux->hash_verified) {
-               struct shash_desc *desc;
-               u8 *result;
+       DMDEBUG("io_alloc for %llu blocks starting at %llu",
+               ULL(io->count), ULL(io->block));
 
-               if (skip_unverified) {
-                       r = 1;
-                       goto release_ret_r;
-               }
+       atomic_set(&io->pending, 0);
 
-               desc = io_hash_desc(v, io);
-               desc->tfm = v->tfm;
-               desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-               r = crypto_shash_init(desc);
-               if (r < 0) {
-                       DMERR("crypto_shash_init failed: %d", r);
-                       goto release_ret_r;
-               }
+       return io;
+}
 
-               if (likely(v->version >= 1)) {
-                       r = crypto_shash_update(desc, v->salt, v->salt_size);
-                       if (r < 0) {
-                               DMERR("crypto_shash_update failed: %d", r);
-                               goto release_ret_r;
-                       }
-               }
+static struct bio *verity_bio_clone(struct dm_verity_io *io)
+{
+       struct verity_config *vc = io->target->private;
+       struct bio *bio = io->bio;
+       struct bio *clone = verity_alloc_bioset(vc, GFP_NOIO, bio->bi_max_vecs);
 
-               r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
-               if (r < 0) {
-                       DMERR("crypto_shash_update failed: %d", r);
-                       goto release_ret_r;
-               }
+       if (!clone)
+               return NULL;
 
-               if (!v->version) {
-                       r = crypto_shash_update(desc, v->salt, v->salt_size);
-                       if (r < 0) {
-                               DMERR("crypto_shash_update failed: %d", r);
-                               goto release_ret_r;
-                       }
-               }
+       __bio_clone(clone, bio);
+       clone->bi_private = io;
+       clone->bi_end_io  = kverityd_src_io_read_end;
+       clone->bi_bdev    = vc->dev->bdev;
+       clone->bi_sector  = vc->start + io->sector;
+       clone->bi_destructor = dm_verity_bio_destructor;
 
-               result = io_real_digest(v, io);
-               r = crypto_shash_final(desc, result);
-               if (r < 0) {
-                       DMERR("crypto_shash_final failed: %d", r);
-                       goto release_ret_r;
-               }
-               if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-                       DMERR_LIMIT("metadata block %llu is corrupted",
-                               (unsigned long long)hash_block);
-                       v->hash_failed = 1;
-                       r = -EIO;
-                       goto release_ret_r;
-               } else
-                       aux->hash_verified = 1;
+       return clone;
+}
+
+/* If the request is not successful, this handler takes action.
+ * TODO make this call a registered handler.
+ */
+static void verity_error(struct verity_config *vc, struct dm_verity_io *io,
+                        int error)
+{
+       const char *message;
+       int error_behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+       dev_t devt = 0;
+       u64 block = ~0;
+       int transient = 1;
+       struct dm_verity_error_state error_state;
+
+       if (vc) {
+               devt = vc->dev->bdev->bd_dev;
+               error_behavior = vc->error_behavior;
        }
 
-       data += offset;
+       if (io) {
+               io->error = -EIO;
+               block = io->block;
+       }
 
-       memcpy(io_want_digest(v, io), data, v->digest_size);
+       switch (error) {
+       case -ENOMEM:
+               message = "out of memory";
+               break;
+       case -EBUSY:
+               message = "pending data seen during verify";
+               break;
+       case -EFAULT:
+               message = "crypto operation failure";
+               break;
+       case -EACCES:
+               message = "integrity failure";
+               /* Image is bad. */
+               transient = 0;
+               break;
+       case -EPERM:
+               message = "hash tree population failure";
+               /* Should be dm-bht specific errors */
+               transient = 0;
+               break;
+       case -EINVAL:
+               message = "unexpected missing/invalid data";
+               /* The device was configured incorrectly - fallback. */
+               transient = 0;
+               break;
+       default:
+               /* Other errors can be passed through as IO errors */
+               message = "unknown or I/O error";
+               return;
+       }
 
-       dm_bufio_release(buf);
-       return 0;
+       DMERR_LIMIT("verification failure occurred: %s", message);
+
+       if (error_behavior == DM_VERITY_ERROR_BEHAVIOR_NOTIFY) {
+               error_state.code = error;
+               error_state.transient = transient;
+               error_state.block = block;
+               error_state.message = message;
+               error_state.dev_start = vc->start;
+               error_state.dev_len = vc->size;
+               error_state.dev = vc->dev->bdev;
+               error_state.hash_dev_start = vc->hash_start;
+               error_state.hash_dev_len = dm_bht_sectors(&vc->bht);
+               error_state.hash_dev = vc->hash_dev->bdev;
+
+               /* Set default fallthrough behavior. */
+               error_state.behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+               error_behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+
+               if (!blocking_notifier_call_chain(
+                   &verity_error_notifier, transient, &error_state)) {
+                       error_behavior = error_state.behavior;
+               }
+       }
 
-release_ret_r:
-       dm_bufio_release(buf);
+       switch (error_behavior) {
+       case DM_VERITY_ERROR_BEHAVIOR_EIO:
+               break;
+       case DM_VERITY_ERROR_BEHAVIOR_NONE:
+               if (error != -EIO && io)
+                       io->error = 0;
+               break;
+       default:
+               goto do_panic;
+       }
+       return;
 
-       return r;
+do_panic:
+       panic("dm-verity failure: "
+             "device:%u:%u error:%d block:%llu message:%s",
+             MAJOR(devt), MINOR(devt), error, ULL(block), message);
 }
 
-/*
- * Verify one "dm_verity_io" structure.
+/**
+ * verity_parse_error_behavior - parse a behavior charp to the enum
+ * @behavior:  NUL-terminated char array
+ *
+ * Checks if the behavior is valid either as text or as an index digit
+ * and returns the proper enum value or -1 on error.
  */
-static int verity_verify_io(struct dm_verity_io *io)
+static int verity_parse_error_behavior(const char *behavior)
 {
-       struct dm_verity *v = io->v;
-       unsigned b;
-       int i;
-       unsigned vector = 0, offset = 0;
-
-       for (b = 0; b < io->n_blocks; b++) {
-               struct shash_desc *desc;
-               u8 *result;
-               int r;
-               unsigned todo;
-
-               if (likely(v->levels)) {
-                       /*
-                        * First, we try to get the requested hash for
-                        * the current block. If the hash block itself is
-                        * verified, zero is returned. If it isn't, this
-                        * function returns 0 and we fall back to whole
-                        * chain verification.
-                        */
-                       int r = verity_verify_level(io, io->block + b, 0, true);
-                       if (likely(!r))
-                               goto test_block_hash;
-                       if (r < 0)
-                               return r;
-               }
+       const char **allowed = allowed_error_behaviors;
+       char index = '0';
 
-               memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
+       for (; *allowed; allowed++, index++)
+               if (!strcmp(*allowed, behavior) || behavior[0] == index)
+                       break;
 
-               for (i = v->levels - 1; i >= 0; i--) {
-                       int r = verity_verify_level(io, io->block + b, i, false);
-                       if (unlikely(r))
-                               return r;
-               }
+       if (!*allowed)
+               return -1;
 
-test_block_hash:
-               desc = io_hash_desc(v, io);
-               desc->tfm = v->tfm;
-               desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-               r = crypto_shash_init(desc);
-               if (r < 0) {
-                       DMERR("crypto_shash_init failed: %d", r);
-                       return r;
-               }
+       /* Convert to the integer index matching the enum. */
+       return allowed - allowed_error_behaviors;
+}
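
For example, given the table order above, both "panic" and "1" parse to
index 1 (DM_VERITY_ERROR_BEHAVIOR_PANIC), while an unrecognized string
returns -1.
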
 
-               if (likely(v->version >= 1)) {
-                       r = crypto_shash_update(desc, v->salt, v->salt_size);
-                       if (r < 0) {
-                               DMERR("crypto_shash_update failed: %d", r);
-                               return r;
-                       }
-               }
 
-               todo = 1 << v->data_dev_block_bits;
-               do {
-                       struct bio_vec *bv;
-                       u8 *page;
-                       unsigned len;
-
-                       BUG_ON(vector >= io->io_vec_size);
-                       bv = &io->io_vec[vector];
-                       page = kmap_atomic(bv->bv_page);
-                       len = bv->bv_len - offset;
-                       if (likely(len >= todo))
-                               len = todo;
-                       r = crypto_shash_update(desc,
-                                       page + bv->bv_offset + offset, len);
-                       kunmap_atomic(page);
-                       if (r < 0) {
-                               DMERR("crypto_shash_update failed: %d", r);
-                               return r;
-                       }
-                       offset += len;
-                       if (likely(offset == bv->bv_len)) {
-                               offset = 0;
-                               vector++;
-                       }
-                       todo -= len;
-               } while (todo);
-
-               if (!v->version) {
-                       r = crypto_shash_update(desc, v->salt, v->salt_size);
-                       if (r < 0) {
-                               DMERR("crypto_shash_update failed: %d", r);
-                               return r;
-                       }
-               }
+/**
+ * match_dev_by_uuid - callback for finding a partition using its uuid
+ * @dev:       device passed in by the caller
+ * @data:      opaque pointer to a uuid packed by part_pack_uuid().
+ *
+ * Returns 1 if the device matches, and 0 otherwise.
+ */
+static int match_dev_by_uuid(struct device *dev, void *data)
+{
+       u8 *uuid = data;
+       struct hd_struct *part = dev_to_part(dev);
 
-               result = io_real_digest(v, io);
-               r = crypto_shash_final(desc, result);
-               if (r < 0) {
-                       DMERR("crypto_shash_final failed: %d", r);
-                       return r;
-               }
-               if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-                       DMERR_LIMIT("data block %llu is corrupted",
-                               (unsigned long long)(io->block + b));
-                       v->hash_failed = 1;
-                       return -EIO;
-               }
-       }
-       BUG_ON(vector != io->io_vec_size);
-       BUG_ON(offset);
+       if (!part->info)
+               goto no_match;
+
+       if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid)))
+               goto no_match;
 
+       return 1;
+no_match:
        return 0;
 }
 
-/*
- * End one "io" structure with a given error.
+/**
+ * dm_get_device_by_uuid: claim a device using its UUID
+ * @ti:                        current dm_target
+ * @uuid_str:          36 byte UUID hex encoded
+ *                     (xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
+ * @dev_start:         offset in sectors passed to dm_get_device
+ * @dev_len:           length in sectors passed to dm_get_device
+ * @dm_dev:            dm_dev to populate
+ *
+ * Wraps dm_get_device allowing it to use a unique partition id to
+ * find a given partition on any drive. This code is based on
+ * printk_all_partitions in that it walks all of the registered block devices.
+ *
+ * N.B., uuid_str is not validated for safety; only its length is checked.
  */
-static void verity_finish_io(struct dm_verity_io *io, int error)
+static int dm_get_device_by_uuid(struct dm_target *ti, const char *uuid_str,
+                            sector_t dev_start, sector_t dev_len,
+                            struct dm_dev **dm_dev)
 {
-       struct bio *bio = io->bio;
-       struct dm_verity *v = io->v;
+       struct device *dev = NULL;
+       dev_t devt = 0;
+       char devt_buf[BDEVT_SIZE];
+       u8 uuid[16];
+       size_t uuid_length = strlen(uuid_str);
+
+       if (uuid_length < 36)
+               goto bad_uuid;
+       /* Pack the requested UUID in the expected format. */
+       part_pack_uuid(uuid_str, uuid);
+
+       dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid);
+       if (!dev)
+               goto found_nothing;
+
+       devt = dev->devt;
+       put_device(dev);
+
+       /* The caller may specify +/-%u after the UUID if they want a partition
+        * before or after the one identified.
+        */
+       if (uuid_length > 36) {
+               unsigned int part_offset;
+               char sign;
+               unsigned minor = MINOR(devt);
+               if (sscanf(uuid_str + 36, "%c%u", &sign, &part_offset) == 2) {
+                       if (sign == '+') {
+                               minor += part_offset;
+                       } else if (sign == '-') {
+                               minor -= part_offset;
+                       } else {
+                               DMWARN("Trailing characters after UUID: %s\n",
+                                       uuid_str);
+                       }
+                       devt = MKDEV(MAJOR(devt), minor);
+               }
+       }
 
-       bio->bi_end_io = io->orig_bi_end_io;
-       bio->bi_private = io->orig_bi_private;
+       /* Construct the dev name to pass to dm_get_device.  dm_get_device
+        * doesn't support being passed a dev_t.
+        */
+       snprintf(devt_buf, sizeof(devt_buf), "%u:%u", MAJOR(devt), MINOR(devt));
+
+       /* TODO(wad) to make this generic we could also pass in the mode. */
+       if (!dm_get_device(ti, devt_buf, dm_table_get_mode(ti->table), dm_dev))
+               return 0;
+
+       ti->error = "Failed to acquire device";
+       DMDEBUG("Failed to acquire discovered device %s", devt_buf);
+       return -1;
+bad_uuid:
+       ti->error = "Bad UUID";
+       DMDEBUG("Supplied value '%s' is an invalid UUID", uuid_str);
+       return -1;
+found_nothing:
+       DMDEBUG("No matching partition for GUID: %s", uuid_str);
+       ti->error = "No matching GUID";
+       return -1;
+}
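
As an example of the +/- suffix handled above, a hypothetical argument of

    12345678-1234-1234-1234-123456789abc+1

claims the partition whose minor number immediately follows the partition
with the matching GUID.
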
 
-       if (io->io_vec != io->io_vec_inline)
-               mempool_free(io->io_vec, v->vec_mempool);
+static int verity_get_device(struct dm_target *ti, const char *devname,
+                            sector_t dev_start, sector_t dev_len,
+                            struct dm_dev **dm_dev)
+{
+       do {
+               /* Try the normal path first since if everything is ready, it
+                * will be the fastest.
+                */
+               if (!dm_get_device(ti, devname,
+                                  dm_table_get_mode(ti->table), dm_dev))
+                       return 0;
+
+               /* Try the device by partition UUID */
+               if (!dm_get_device_by_uuid(ti, devname, dev_start, dev_len,
+                                          dm_dev))
+                       return 0;
+
+               /* No need to be too aggressive since this is a slow path. */
+               msleep(500);
+       } while (dev_wait && (driver_probe_done() != 0 || *dm_dev == NULL));
+       async_synchronize_full();
+       return -1;
+}
 
-       mempool_free(io, v->io_mempool);
 
-       bio_endio(bio, error);
-}
+/*-----------------------------------------------------------------
+ * Reverse flow of requests into the device.
+ *
+ * (Start at the bottom with verity_map and work your way upward).
+ *-----------------------------------------------------------------*/
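
Roughly, a read makes the following round trip through the functions that
follow, with workqueue hand-offs decided in verity_dec_pending:

    verity_map -> kverityd_io -> kverityd_io_bht_populate
      -> kverityd_bht_read_callback      (hash device reads)
      -> kverityd_io_bht_populate_end -> verity_dec_pending
         (requeued to kverityd_ioq until the bht is populated)
      -> kverityd_verify -> verity_verify
      -> verity_return_bio_to_caller -> bio_endio
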
+
+static void verity_inc_pending(struct dm_verity_io *io);
 
-static void verity_work(struct work_struct *w)
+static void verity_return_bio_to_caller(struct dm_verity_io *io)
 {
-       struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
+       struct verity_config *vc = io->target->private;
+
+       if (io->error)
+               verity_error(vc, io, io->error);
 
-       verity_finish_io(io, verity_verify_io(io));
+       bio_endio(io->bio, io->error);
+       mempool_free(io, vc->io_pool);
 }
 
-static void verity_end_io(struct bio *bio, int error)
+/* Check for any missing bht hashes. */
+static bool verity_is_bht_populated(struct dm_verity_io *io)
 {
-       struct dm_verity_io *io = bio->bi_private;
+       struct verity_config *vc = io->target->private;
+       u64 block;
 
-       if (error) {
-               verity_finish_io(io, error);
-               return;
-       }
+       for (block = io->block; block < io->block + io->count; ++block)
+               if (!dm_bht_is_populated(&vc->bht, block))
+                       return false;
 
-       INIT_WORK(&io->work, verity_work);
-       queue_work(io->v->verify_wq, &io->work);
+       return true;
 }
 
-/*
- * Prefetch buffers for the specified io.
- * The root buffer is not prefetched, it is assumed that it will be cached
- * all the time.
+/* verity_dec_pending manages the lifetime of all dm_verity_io structs.
+ * Non-bug error handling is centralized through this interface and
+ * all passage from workqueue to workqueue.
  */
-static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
+static void verity_dec_pending(struct dm_verity_io *io)
 {
-       int i;
-
-       for (i = v->levels - 2; i >= 0; i--) {
-               sector_t hash_block_start;
-               sector_t hash_block_end;
-               verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
-               verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
-               if (!i) {
-                       unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
-
-                       cluster >>= v->data_dev_block_bits;
-                       if (unlikely(!cluster))
-                               goto no_prefetch_cluster;
-
-                       if (unlikely(cluster & (cluster - 1)))
-                               cluster = 1 << (fls(cluster) - 1);
-
-                       hash_block_start &= ~(sector_t)(cluster - 1);
-                       hash_block_end |= cluster - 1;
-                       if (unlikely(hash_block_end >= v->hash_blocks))
-                               hash_block_end = v->hash_blocks - 1;
-               }
-no_prefetch_cluster:
-               dm_bufio_prefetch(v->bufio, hash_block_start,
-                                 hash_block_end - hash_block_start + 1);
+       struct verity_config *vc;
+
+       VERITY_BUG_ON(!io, "NULL argument");
+       vc = io->target->private;
+
+       DMDEBUG("dec pending %p: %d--", io, atomic_read(&io->pending));
+
+       if (!atomic_dec_and_test(&io->pending))
+               goto done;
+
+       if (unlikely(io->error))
+               goto io_error;
+
+       /* I/Os that were pending may now be ready */
+       if (verity_is_bht_populated(io)) {
+               verity_stats_io_queue_dec(vc);
+               verity_stats_verify_queue_inc(vc);
+               INIT_DELAYED_WORK(&io->work, kverityd_verify);
+               queue_delayed_work(kveritydq, &io->work, 0);
+               REQTRACE("Block %llu+ is being queued for verify (io:%p)",
+                        ULL(io->block), io);
+       } else {
+               INIT_DELAYED_WORK(&io->work, kverityd_io);
+               queue_delayed_work(kverityd_ioq, &io->work, HZ/10);
+               verity_stats_total_requeues_inc(vc);
+               REQTRACE("Block %llu+ is being requeued for io (io:%p)",
+                        ULL(io->block), io);
        }
+
+done:
+       return;
+
+io_error:
+       verity_return_bio_to_caller(io);
 }
 
-/*
- * Bio map function. It allocates dm_verity_io structure and bio vector and
- * fills them. Then it issues prefetches and the I/O.
+/* Walks the data set and computes the hash of the data read from the
+ * untrusted source device.  The computed hash is then passed to dm-bht
+ * for verification.
  */
-static int verity_map(struct dm_target *ti, struct bio *bio,
-                     union map_info *map_context)
+static int verity_verify(struct verity_config *vc,
+                        struct bio *bio)
 {
-       struct dm_verity *v = ti->private;
-       struct dm_verity_io *io;
+       unsigned int idx;
+       u64 block;
+       int r;
 
-       bio->bi_bdev = v->data_dev->bdev;
-       bio->bi_sector = verity_map_sector(v, bio->bi_sector);
+       VERITY_BUG_ON(bio == NULL);
 
-       if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
-           ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
-               DMERR_LIMIT("unaligned io");
-               return -EIO;
-       }
+       block = to_bytes(bio->bi_sector) >> VERITY_BLOCK_SHIFT;
 
-       if ((bio->bi_sector + bio_sectors(bio)) >>
-           (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
-               DMERR_LIMIT("io out of range");
-               return -EIO;
+       for (idx = bio->bi_idx; idx < bio->bi_vcnt; idx++) {
+               struct bio_vec *bv = bio_iovec_idx(bio, idx);
+
+               VERITY_BUG_ON(bv->bv_offset % VERITY_BLOCK_SIZE);
+               VERITY_BUG_ON(bv->bv_len % VERITY_BLOCK_SIZE);
+
+               DMDEBUG("Updating hash for block %llu", ULL(block));
+
+               /* TODO(msb) handle case where multiple blocks fit in a page */
+               r = dm_bht_verify_block(&vc->bht, block,
+                                       bv->bv_page, bv->bv_offset);
+               /* dm_bht functions aren't expected to return errno friendly
+                * values.  They are converted here for uniformity.
+                */
+               if (r > 0) {
+                       DMERR("Pending data for block %llu seen at verify",
+                             ULL(block));
+                       r = -EBUSY;
+                       goto bad_state;
+               }
+               if (r < 0) {
+                       DMERR_LIMIT("Block hash does not match!");
+                       r = -EACCES;
+                       goto bad_match;
+               }
+               REQTRACE("Block %llu verified", ULL(block));
+
+               block++;
+               /* After completing a block, allow a reschedule.
+                * TODO(wad) determine if this is truly needed.
+                */
+               cond_resched();
        }
 
-       if (bio_data_dir(bio) == WRITE)
-               return -EIO;
+       return 0;
 
-       io = mempool_alloc(v->io_mempool, GFP_NOIO);
-       io->v = v;
-       io->bio = bio;
-       io->orig_bi_end_io = bio->bi_end_io;
-       io->orig_bi_private = bio->bi_private;
-       io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
-       io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
-
-       bio->bi_end_io = verity_end_io;
-       bio->bi_private = io;
-       io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
-       if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
-               io->io_vec = io->io_vec_inline;
-       else
-               io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
-       memcpy(io->io_vec, bio_iovec(bio),
-              io->io_vec_size * sizeof(struct bio_vec));
-
-       verity_prefetch_io(v, io);
+bad_state:
+bad_match:
+       return r;
+}
 
-       generic_make_request(bio);
+/* Services the verify workqueue */
+static void kverityd_verify(struct work_struct *work)
+{
+       struct delayed_work *dwork = container_of(work, struct delayed_work,
+                                                 work);
+       struct dm_verity_io *io = container_of(dwork, struct dm_verity_io,
+                                              work);
+       struct verity_config *vc = io->target->private;
 
-       return DM_MAPIO_SUBMITTED;
+       io->error = verity_verify(vc, io->bio);
+
+       /* Free up the bio and tag with the return value */
+       verity_stats_verify_queue_dec(vc);
+       verity_return_bio_to_caller(io);
 }
 
-/*
- * Status: V (valid) or C (corruption found)
+/* Asynchronously called upon the completion of dm-bht I/O.  The status
+ * of the operation is passed back to dm-bht and the next steps are
+ * decided by verity_dec_pending.
  */
-static int verity_status(struct dm_target *ti, status_type_t type,
-                        char *result, unsigned maxlen)
+static void kverityd_io_bht_populate_end(struct bio *bio, int error)
 {
-       struct dm_verity *v = ti->private;
-       unsigned sz = 0;
-       unsigned x;
+       struct dm_bht_entry *entry = (struct dm_bht_entry *) bio->bi_private;
+       struct dm_verity_io *io = (struct dm_verity_io *) entry->io_context;
 
-       switch (type) {
-       case STATUSTYPE_INFO:
-               DMEMIT("%c", v->hash_failed ? 'C' : 'V');
-               break;
-       case STATUSTYPE_TABLE:
-               DMEMIT("%u %s %s %u %u %llu %llu %s ",
-                       v->version,
-                       v->data_dev->name,
-                       v->hash_dev->name,
-                       1 << v->data_dev_block_bits,
-                       1 << v->hash_dev_block_bits,
-                       (unsigned long long)v->data_blocks,
-                       (unsigned long long)v->hash_start,
-                       v->alg_name
-                       );
-               for (x = 0; x < v->digest_size; x++)
-                       DMEMIT("%02x", v->root_digest[x]);
-               DMEMIT(" ");
-               if (!v->salt_size)
-                       DMEMIT("-");
-               else
-                       for (x = 0; x < v->salt_size; x++)
-                               DMEMIT("%02x", v->salt[x]);
-               break;
+       DMDEBUG("kverityd_io_bht_populate_end (io:%p, entry:%p)", io, entry);
+       /* Tell the tree to atomically update now that we've populated
+        * the given entry.
+        */
+       dm_bht_read_completed(entry, error);
+
+       /* Clean up for reuse when reading data to be checked */
+       bio->bi_vcnt = 0;
+       bio->bi_io_vec->bv_offset = 0;
+       bio->bi_io_vec->bv_len = 0;
+       bio->bi_io_vec->bv_page = NULL;
+       /* Restore the private data to I/O so the destructor can be shared. */
+       bio->bi_private = (void *) io;
+       bio_put(bio);
+
+       /* We bail but assume the tree has been marked bad. */
+       if (unlikely(error)) {
+               DMERR("Failed to read for sector %llu (%u)",
+                     ULL(io->bio->bi_sector), io->bio->bi_size);
+               io->error = error;
+               /* Pass through the error to verity_dec_pending below */
        }
+       /* When pending = 0, it will transition to reading real data */
+       verity_dec_pending(io);
+}
 
+/* Called by dm-bht (via dm_bht_populate), this function provides
+ * the message digests to dm-bht that are stored on disk.
+ */
+static int kverityd_bht_read_callback(void *ctx, sector_t start, u8 *dst,
+                                     sector_t count,
+                                     struct dm_bht_entry *entry)
+{
+       struct dm_verity_io *io = ctx;  /* I/O for this batch */
+       struct verity_config *vc;
+       struct bio *bio;
+       /* Explicitly catch these so we can use a custom bug route */
+       VERITY_BUG_ON(!io || !dst || !io->target || !io->target->private);
+       VERITY_BUG_ON(!entry);
+       VERITY_BUG_ON(count != to_sector(VERITY_BLOCK_SIZE));
+
+       vc = io->target->private;
+
+       /* The I/O context is nested inside the entry so that we don't need one
+        * io context per page read.
+        */
+       entry->io_context = ctx;
+
+       /* We should only get page size requests at present. */
+       verity_inc_pending(io);
+       bio = verity_alloc_bioset(vc, GFP_NOIO, 1);
+       if (unlikely(!bio)) {
+               DMCRIT("Out of memory at bio_alloc_bioset");
+               dm_bht_read_completed(entry, -ENOMEM);
+               return -ENOMEM;
+       }
+       bio->bi_private = (void *) entry;
+       bio->bi_idx = 0;
+       bio->bi_size = VERITY_BLOCK_SIZE;
+       bio->bi_sector = vc->hash_start + start;
+       bio->bi_bdev = vc->hash_dev->bdev;
+       bio->bi_end_io = kverityd_io_bht_populate_end;
+       bio->bi_rw = REQ_META;
+       /* Only need to free the bio since the page is managed by bht */
+       bio->bi_destructor = dm_verity_bio_destructor;
+       bio->bi_vcnt = 1;
+       bio->bi_io_vec->bv_offset = 0;
+       bio->bi_io_vec->bv_len = to_bytes(count);
+       /* dst is guaranteed to be a page_pool allocation */
+       bio->bi_io_vec->bv_page = virt_to_page(dst);
+       /* Track that this I/O is in use.  There should be no risk of the io
+        * being removed prior since this is called synchronously.
+        */
+       DMDEBUG("Submitting bht io %p (entry:%p)", io, entry);
+       vc->stats.bht_requests++;
+       generic_make_request(bio);
        return 0;
 }
 
-static int verity_ioctl(struct dm_target *ti, unsigned cmd,
-                       unsigned long arg)
+/* Submits an io request for each missing block of block hashes.
+ * The last one to return will then enqueue this on the io workqueue.
+ */
+static void kverityd_io_bht_populate(struct dm_verity_io *io)
 {
-       struct dm_verity *v = ti->private;
-       int r = 0;
+       struct verity_config *vc = io->target->private;
+       u64 block;
+
+       REQTRACE("populating %llu starting at block %llu (io:%p)",
+                ULL(io->count), ULL(io->block), io);
+       for (block = io->block; block < io->block + io->count; ++block) {
+               int populated;
+
+               DMDEBUG("Calling dm_bht_populate for %ull (io:%p)",
+                       ULL(block), io);
+               populated = dm_bht_populate(&vc->bht, io, block);
+               if (populated < 0) {
+                       DMCRIT("dm_bht_populate error: block %llu (io:%p): %d",
+                              ULL(block), io, populated);
+                       /* TODO(wad) support propagating transient errors
+                        *           cleanly.
+                        */
+                       /* verity_dec_pending will handle the error case. */
+                       io->error = -EPERM;
+                       break;
+               }
+       }
+       REQTRACE("Block %llu+ initiated %d requests (io: %p)",
+                ULL(io->block), atomic_read(&io->pending) - 1, io);
+}
+
+/* Asynchronously called upon the completion of I/O issued
+ * from kverityd_src_io_read. verity_dec_pending() acts as
+ * the scheduler/flow manager.
+ */
+static void kverityd_src_io_read_end(struct bio *clone, int error)
+{
+       struct dm_verity_io *io = clone->bi_private;
+
+       DMDEBUG("I/O completed");
+       if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
+               error = -EIO;
 
-       if (v->data_start ||
-           ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
-               r = scsi_verify_blk_ioctl(NULL, cmd);
+       if (unlikely(error)) {
+               DMERR("Error occurred: %d (%llu, %u)",
+                       error, ULL(clone->bi_sector), clone->bi_size);
+               io->error = error;
+       }
+
+       /* Release the clone; this just keeps the block layer from
+        * leaving offsets, etc., in unexpected states.
+        */
+       bio_put(clone);
 
-       return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
-                                    cmd, arg);
+       verity_dec_pending(io);
+       DMDEBUG("all data has been loaded from the data device");
 }
 
-static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-                       struct bio_vec *biovec, int max_size)
+/* If not yet underway, an I/O request will be issued to the vc->dev
+ * device for the data needed. It is cloned to avoid unexpected changes
+ * to the original bio struct.
+ */
+static void kverityd_src_io_read(struct dm_verity_io *io)
 {
-       struct dm_verity *v = ti->private;
-       struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
+       struct verity_config *vc = io->target->private;
+       struct bio *clone;
 
-       if (!q->merge_bvec_fn)
-               return max_size;
+       VERITY_BUG_ON(!io);
+
+       /* If the bio has already been cloned, the read is already issued;
+        * we could also check BIO_UPTODATE, but it doesn't seem needed.
+        */
+       if (io->flags & VERITY_IOFLAGS_CLONED) {
+               DMDEBUG("io_read called with existing bio. bailing: %p", io);
+               return;
+       }
+       io->flags |= VERITY_IOFLAGS_CLONED;
 
-       bvm->bi_bdev = v->data_dev->bdev;
-       bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
+       DMDEBUG("kverity_io_read started");
 
-       return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+       /* Clone the bio. The block layer may modify the bvec array. */
+       DMDEBUG("Creating clone of the request");
+       ALLOCTRACE("clone for io %p, sector %llu",
+                  io, ULL(vc->start + io->sector));
+       clone = verity_bio_clone(io);
+       if (unlikely(!clone)) {
+               io->error = -ENOMEM;
+               return;
+       }
+
+       verity_inc_pending(io);
+
+       /* Submit to the block device */
+       DMDEBUG("Submitting bio");
+       /* XXX: check queue_max_hw_sectors(bdev_get_queue(clone->bi_bdev)); */
+       generic_make_request(clone);
 }
 
-static int verity_iterate_devices(struct dm_target *ti,
-                                 iterate_devices_callout_fn fn, void *data)
+/* kverityd_io services the I/O workqueue. For each pass through
+ * the I/O workqueue, a call to populate both the origin drive
+ * data and the hash tree data is made.
+ */
+static void kverityd_io(struct work_struct *work)
 {
-       struct dm_verity *v = ti->private;
-
-       return fn(ti, v->data_dev, v->data_start, ti->len, data);
+       struct delayed_work *dwork = container_of(work, struct delayed_work,
+                                                 work);
+       struct dm_verity_io *io = container_of(dwork, struct dm_verity_io,
+                                              work);
+       VERITY_BUG_ON(!io->bio);
+
+       /* Issue requests asynchronously. */
+       verity_inc_pending(io);
+       kverityd_src_io_read(io);
+       kverityd_io_bht_populate(io);
+       verity_dec_pending(io);
 }
 
-static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+/* Paired with verity_dec_pending, the pending count in the io dictates the
+ * lifetime of a request and when it is ready to be processed on the
+ * workqueues.
+ */
+static void verity_inc_pending(struct dm_verity_io *io)
 {
-       struct dm_verity *v = ti->private;
+       atomic_inc(&io->pending);
+}
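
The counterpart, verity_dec_pending, is defined earlier in the file. Going
only by what the surrounding comments say it does (the last dropper either
completes an errored request or queues the verify stage), its shape is
roughly the sketch below; this is illustrative, not the actual code:

    /* Sketch only: mirrors the behavior the comments above describe. */
    static void verity_dec_pending_sketch(struct dm_verity_io *io)
    {
            if (atomic_dec_and_test(&io->pending)) {
                    if (unlikely(io->error)) {
                            /* Hand the bio back with the error attached. */
                            verity_return_bio_to_caller(io);
                    } else {
                            /* All reads done; queue the verify stage. */
                            INIT_DELAYED_WORK(&io->work, kverityd_verify);
                            queue_delayed_work(kveritydq, &io->work, 0);
                    }
            }
    }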
+
+/* Block-level requests start here. */
+static int verity_map(struct dm_target *ti, struct bio *bio,
+                     union map_info *map_context)
+{
+       struct dm_verity_io *io;
+       struct verity_config *vc;
+       struct request_queue *r_queue;
 
-       if (limits->logical_block_size < 1 << v->data_dev_block_bits)
-               limits->logical_block_size = 1 << v->data_dev_block_bits;
+       if (unlikely(!ti)) {
+               DMERR("dm_target was NULL");
+               return -EIO;
+       }
 
-       if (limits->physical_block_size < 1 << v->data_dev_block_bits)
-               limits->physical_block_size = 1 << v->data_dev_block_bits;
+       vc = ti->private;
+       r_queue = bdev_get_queue(vc->dev->bdev);
+
+       /* Trace incoming bios */
+       REQTRACE("Got a %s for %llu, %u bytes)",
+               (bio_rw(bio) == WRITE ? "WRITE" :
+               (bio_rw(bio) == READ ? "READ" : "READA")),
+               ULL(bio->bi_sector), bio->bi_size);
+
+       verity_stats_total_requests_inc(vc);
+
+       if (bio_data_dir(bio) == WRITE) {
+               /* If we silently drop writes, then the VFS layer will cache
+                * the write and persist it in memory. While it doesn't change
+                * the underlying storage, it still may be contrary to the
+                * behavior expected by a verified, read-only device.
+                */
+               DMWARN_LIMIT("write request received. rejecting with -EIO.");
+               verity_error(vc, NULL, -EIO);
+               /* bio_endio(bio, -EIO); */
+               return -EIO;
+       } else {
+               VERITY_BUG_ON(bio->bi_sector % to_sector(VERITY_BLOCK_SIZE));
+               VERITY_BUG_ON(bio->bi_size % VERITY_BLOCK_SIZE);
+
+               /* Queue up the request to be verified */
+               io = verity_io_alloc(ti, bio, bio->bi_sector - ti->begin);
+               if (!io) {
+                       DMERR_LIMIT("Failed to allocate and init IO data");
+                       return DM_MAPIO_REQUEUE;
+               }
+               verity_stats_io_queue_inc(vc);
+               vc->stats.total_blocks += io->count;
+               vc->stats.io_by_block_size[ilog2(io->count)]++;
+
+               vc->stats.total_size += bio->bi_size;
+               vc->stats.io_size_by_block_size[ilog2(io->count)] +=
+                       bio->bi_size;
+               INIT_DELAYED_WORK(&io->work, kverityd_io);
+               queue_delayed_work(kverityd_ioq, &io->work, 0);
+       }
 
-       blk_limits_io_min(limits, limits->logical_block_size);
+       return DM_MAPIO_SUBMITTED;
 }
 
-static void verity_dtr(struct dm_target *ti)
+static int verity_stats_seq_show(struct seq_file *seq, void *offset)
 {
-       struct dm_verity *v = ti->private;
-
-       if (v->verify_wq)
-               destroy_workqueue(v->verify_wq);
-
-       if (v->vec_mempool)
-               mempool_destroy(v->vec_mempool);
+       struct verity_config *vc = seq->private;
+       struct verity_stats *stats = &vc->stats;
+       unsigned long long running_total;
+       int i;
 
-       if (v->io_mempool)
-               mempool_destroy(v->io_mempool);
+       seq_printf(seq, "%d\tI/O queue pending\n", (int)stats->io_queue);
+       seq_printf(seq, "%u\tVerify queue pending\n", stats->verify_queue);
+       seq_printf(seq, "%lu\tHash block requests\n", stats->bht_requests);
+       seq_printf(seq, "%llu\tTotal re-queues\n", stats->total_requeues);
+       seq_printf(seq, "%llu\tTotal requests\n", stats->total_requests);
+       seq_printf(seq, "%lluMB\tTotal size\n", stats->total_size >> 20);
+       seq_printf(seq, "%llu\tTotal blocks\n", stats->total_blocks);
+       for (running_total = i = 0; i < 30; i++) {
+               if (stats->io_by_block_size[i]) {
+                       running_total += stats->io_size_by_block_size[i];
+                       seq_printf(seq, "%lu\tRequests of size %u-%u"
+                               " (%uKB to %uKB), %lluKB, "
+                               "run.tot. = %lluMB\n",
+                               stats->io_by_block_size[i],
+                               1U << i, (2U << i) - 1,
+                               1U << i << VERITY_BLOCK_SHIFT >> 10,
+                               ((2U << i) - 1) << VERITY_BLOCK_SHIFT >> 10,
+                               stats->io_size_by_block_size[i] >> 10,
+                               running_total >> 20);
+               }
+       }
 
-       if (v->bufio)
-               dm_bufio_client_destroy(v->bufio);
+       return 0;
+}
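
For orientation, reading the stats file this serves (created below under
/sys/kernel/debug/dm-verity/<name>/stats) yields output shaped like the
following. The numbers are invented purely to illustrate the format
strings above, and the KB/MB math assumes a 4KB VERITY_BLOCK_SIZE:

    0       I/O queue pending
    0       Verify queue pending
    117     Hash block requests
    0       Total re-queues
    2100    Total requests
    25MB    Total size
    6576    Total blocks
    1200    Requests of size 1-1 (4KB to 4KB), 4800KB, run.tot. = 4MB
    900     Requests of size 4-7 (16KB to 28KB), 21504KB, run.tot. = 25MB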
 
-       kfree(v->salt);
-       kfree(v->root_digest);
+static int verity_stats_open_fs(struct inode *inode, struct file *file)
+{
+       return single_open(file, verity_stats_seq_show, inode->i_private);
+}
 
-       if (v->tfm)
-               crypto_free_shash(v->tfm);
+static const struct file_operations verity_stats_fops = {
+       .owner = THIS_MODULE,
+       .open = verity_stats_open_fs,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
 
-       kfree(v->alg_name);
+static int verity_init_debugfs(struct verity_config *vc)
+{
+       struct dentry *dir, *stats;
+
+       dir = debugfs_create_dir(vc->name, debugfs_root);
+       if (!dir)
+               goto cant_create_dir;
+       stats = debugfs_create_file("stats",
+                       S_IFREG | S_IRUSR | S_IRGRP | S_IROTH,
+                       dir, vc, &verity_stats_fops);
+       if (!stats)
+               goto cant_create_file;
+
+       vc->debugfs_dir = dir;
+       return 0;
 
-       if (v->hash_dev)
-               dm_put_device(ti, v->hash_dev);
+cant_create_file:
+       debugfs_remove_recursive(dir);
+cant_create_dir:
+       return -ENODEV;
+}
 
-       if (v->data_dev)
-               dm_put_device(ti, v->data_dev);
+static void verity_cleanup_debugfs(struct verity_config *vc)
+{
+       debugfs_remove_recursive(vc->debugfs_dir);
+}
 
-       kfree(v);
+static void splitarg(char *arg, char **key, char **val)
+{
+       *key = strsep(&arg, "=");
+       *val = strsep(&arg, "");
 }
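
splitarg() tokenizes one "key=val" string in place via strsep(); a small
illustration of its contract (hypothetical caller, not from the patch):

    static void splitarg_example(void)
    {
            char buf[] = "alg=sha256";  /* must be writable */
            char *key, *val;

            splitarg(buf, &key, &val);  /* key -> "alg", val -> "sha256" */
            /* A bare "alg" (no '=') yields val == NULL, which the
             * constructor's argument loop reports as a missing value.
             */
    }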
 
 /*
- * Target parameters:
- *     <version>       The current format is version 1.
- *                     Vsn 0 is compatible with original Chromium OS releases.
- *     <data device>
- *     <hash device>
- *     <data block size>
- *     <hash block size>
- *     <the number of data blocks>
- *     <hash start block>
- *     <algorithm>
- *     <digest>
- *     <salt>          Hex string or "-" if no salt.
+ * Non-block interfaces and device-mapper specific code
+ */
+
+/**
+ * verity_ctr - Construct a verified mapping
+ * @ti:   Target being created
+ * @argc: Number of elements in argv
+ * @argv: Vector of key-value pairs (see below).
+ *
+ * Accepts the following keys:
+ * @payload:        hashed device
+ * @hashtree:       device hashtree is stored on
+ * @hashstart:      start address of hashes (default 0)
+ * @alg:            hash algorithm
+ * @root_hexdigest: toplevel hash of the tree
+ * @error_behavior: what to do when verification fails [optional]
+ * @salt:           salt, in hex [optional]
+ *
+ * E.g.,
+ * payload=/dev/sda2 hashtree=/dev/sda3 alg=sha256
+ * root_hexdigest=f08aa4a3695290c569eb1b0ac032ae1040150afb527abbeb0a3da33d82fb2c6e
+ *
+ * TODO(wad):
+ * - Boot time addition
+ * - Track block verification to free block_hashes if memory use is a concern
+ * Testing needed:
+ * - Regular slub_debug tracing (on checkins)
+ * - Improper block hash padding
+ * - Improper bundle padding
+ * - Improper hash layout
+ * - Missing padding at end of device
+ * - Improperly sized underlying devices
+ * - Out of memory conditions (make sure this isn't too flaky under high load!)
+ * - Incorrect superhash
+ * - Incorrect block hashes
+ * - Incorrect bundle hashes
+ * - Boot-up read speed; sustained read speeds
  */
-static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
-       struct dm_verity *v;
-       unsigned num;
-       unsigned long long num_ll;
-       int r;
+       struct verity_config *vc = NULL;
+       int ret = 0;
+       sector_t blocks;
+       const char *payload = NULL;
+       const char *hashtree = NULL;
+       unsigned long hashstart = 0;
+       const char *alg = NULL;
+       const char *root_hexdigest = NULL;
+       const char *dev_error_behavior = error_behavior;
+       const char *hexsalt = NULL;
        int i;
-       sector_t hash_position;
-       char dummy;
 
-       v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
-       if (!v) {
-               ti->error = "Cannot allocate verity structure";
-               return -ENOMEM;
+       if (argc >= 6 && !strchr(argv[3], '=')) {
+               /* Transitional hack - support the old positional-argument format.
+                * Detect it because it requires specifying an unused arg
+                * (depth) which does not contain an '='. */
+               unsigned long long tmpull;
+               if (strcmp(argv[3], "0")) {
+                       ti->error = "Non-zero depth supplied";
+                       return -EINVAL;
+               }
+               if (sscanf(argv[2], "%llu", &tmpull) != 1) {
+                       ti->error = "Invalid hash_start supplied";
+                       return -EINVAL;
+               }
+               payload = argv[0];
+               hashtree = argv[1];
+               hashstart = tmpull;
+               alg = argv[4];
+               root_hexdigest = argv[5];
+               if (argc > 6)
+                       dev_error_behavior = argv[6];
+       } else {
+               for (i = 0; i < argc; ++i) {
+                       char *key, *val;
+                       DMWARN("Argument %d: '%s'", i, argv[i]);
+                       splitarg(argv[i], &key, &val);
+                       if (!key) {
+                               DMWARN("Bad argument %d: missing key?", i);
+                               break;
+                       }
+                       if (!val) {
+                               DMWARN("Bad argument %d='%s': missing value", i, key);
+                               break;
+                       }
+                       if (!strcmp(key, "alg")) {
+                               alg = val;
+                       } else if (!strcmp(key, "payload")) {
+                               payload = val;
+                       } else if (!strcmp(key, "hashtree")) {
+                               hashtree = val;
+                       } else if (!strcmp(key, "root_hexdigest")) {
+                               root_hexdigest = val;
+                       } else if (!strcmp(key, "hashstart")) {
+                               if (strict_strtoul(val, 10, &hashstart)) {
+                                       ti->error = "Invalid hashstart";
+                                       return -EINVAL;
+                               }
+                       } else if (!strcmp(key, "error_behavior")) {
+                               dev_error_behavior = val;
+                       } else if (!strcmp(key, "salt")) {
+                               hexsalt = val;
+                       }
+               }
        }
-       ti->private = v;
-       v->ti = ti;
 
-       if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
-               ti->error = "Device must be readonly";
-               r = -EINVAL;
-               goto bad;
+#define NEEDARG(n) \
+       if (!(n)) { \
+               ti->error = "Missing argument: " #n; \
+               return -EINVAL; \
        }
 
-       if (argc != 10) {
-               ti->error = "Invalid argument count: exactly 10 arguments required";
-               r = -EINVAL;
-               goto bad;
-       }
+       NEEDARG(alg);
+       NEEDARG(payload);
+       NEEDARG(hashtree);
+       NEEDARG(root_hexdigest);
 
-       if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
-           num < 0 || num > 1) {
-               ti->error = "Invalid version";
-               r = -EINVAL;
-               goto bad;
-       }
-       v->version = num;
+#undef NEEDARG
 
-       r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
-       if (r) {
-               ti->error = "Data device lookup failed";
-               goto bad;
+       /* The device mapper device should be set up read-only */
+       if ((dm_table_get_mode(ti->table) & ~FMODE_READ) != 0) {
+               ti->error = "Must be created readonly.";
+               return -EINVAL;
        }
 
-       r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
-       if (r) {
-               ti->error = "Data device lookup failed";
-               goto bad;
+       ALLOCTRACE("verity_config");
+       vc = kzalloc(sizeof(*vc), GFP_KERNEL);
+       if (!vc) {
+               /* TODO(wad) if this is called from the setup helper, then we
+                * catch these errors and do a CrOS specific thing. if not, we
+                * need to have this call the error handler.
+                */
+               return -EINVAL;
        }
 
-       if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
-           !num || (num & (num - 1)) ||
-           num < bdev_logical_block_size(v->data_dev->bdev) ||
-           num > PAGE_SIZE) {
-               ti->error = "Invalid data device block size";
-               r = -EINVAL;
-               goto bad;
-       }
-       v->data_dev_block_bits = ffs(num) - 1;
+       /* For the name, use the device-mapper disk name (e.g. dm-0) */
+       vc->name = dm_disk(dm_table_get_md(ti->table))->disk_name;
 
-       if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
-           !num || (num & (num - 1)) ||
-           num < bdev_logical_block_size(v->hash_dev->bdev) ||
-           num > INT_MAX) {
-               ti->error = "Invalid hash device block size";
-               r = -EINVAL;
-               goto bad;
-       }
-       v->hash_dev_block_bits = ffs(num) - 1;
+       if (verity_init_debugfs(vc))
+               goto bad_debugfs;
 
-       if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
-           num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
-           (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
-               ti->error = "Invalid data blocks";
-               r = -EINVAL;
-               goto bad;
+       /* Calculate the blocks from the given device size */
+       vc->size = ti->len;
+       blocks = to_bytes(vc->size) >> VERITY_BLOCK_SHIFT;
+       if (dm_bht_create(&vc->bht, blocks, alg)) {
+               DMERR("failed to create required bht");
+               goto bad_bht;
        }
-       v->data_blocks = num_ll;
-
-       if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
-               ti->error = "Data device is too small";
-               r = -EINVAL;
-               goto bad;
+       if (dm_bht_set_root_hexdigest(&vc->bht, root_hexdigest)) {
+               DMERR("root hexdigest error");
+               goto bad_root_hexdigest;
        }
-
-       if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
-           num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
-           (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
-               ti->error = "Invalid hash start";
-               r = -EINVAL;
-               goto bad;
+       if (hexsalt)
+               dm_bht_set_salt(&vc->bht, hexsalt);
+       dm_bht_set_read_cb(&vc->bht, kverityd_bht_read_callback);
+
+       /* payload: device to verify */
+       vc->start = 0;  /* TODO: should this support a starting offset? */
+       /* We only ever grab the device in read-only mode. */
+       ret = verity_get_device(ti, payload, vc->start, ti->len, &vc->dev);
+       if (ret) {
+               DMERR("Failed to acquire device '%s': %d", payload, ret);
+               ti->error = "Device lookup failed";
+               goto bad_verity_dev;
        }
-       v->hash_start = num_ll;
 
-       v->alg_name = kstrdup(argv[7], GFP_KERNEL);
-       if (!v->alg_name) {
-               ti->error = "Cannot allocate algorithm name";
-               r = -ENOMEM;
-               goto bad;
+       if ((to_bytes(vc->start) % VERITY_BLOCK_SIZE) ||
+           (to_bytes(vc->size) % VERITY_BLOCK_SIZE)) {
+               ti->error = "Device must be VERITY_BLOCK_SIZE divisble/aligned";
+               goto bad_hash_start;
        }
 
-       v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
-       if (IS_ERR(v->tfm)) {
-               ti->error = "Cannot initialize hash function";
-               r = PTR_ERR(v->tfm);
-               v->tfm = NULL;
-               goto bad;
-       }
-       v->digest_size = crypto_shash_digestsize(v->tfm);
-       if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
-               ti->error = "Digest size too big";
-               r = -EINVAL;
-               goto bad;
-       }
-       v->shash_descsize =
-               sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
-
-       v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
-       if (!v->root_digest) {
-               ti->error = "Cannot allocate root digest";
-               r = -ENOMEM;
-               goto bad;
-       }
-       if (strlen(argv[8]) != v->digest_size * 2 ||
-           hex2bin(v->root_digest, argv[8], v->digest_size)) {
-               ti->error = "Invalid root digest";
-               r = -EINVAL;
-               goto bad;
-       }
-
-       if (strcmp(argv[9], "-")) {
-               v->salt_size = strlen(argv[9]) / 2;
-               v->salt = kmalloc(v->salt_size, GFP_KERNEL);
-               if (!v->salt) {
-                       ti->error = "Cannot allocate salt";
-                       r = -ENOMEM;
-                       goto bad;
-               }
-               if (strlen(argv[9]) != v->salt_size * 2 ||
-                   hex2bin(v->salt, argv[9], v->salt_size)) {
-                       ti->error = "Invalid salt";
-                       r = -EINVAL;
-                       goto bad;
-               }
-       }
+       vc->hash_start = (sector_t)hashstart;
 
-       v->hash_per_block_bits =
-               fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
-
-       v->levels = 0;
-       if (v->data_blocks)
-               while (v->hash_per_block_bits * v->levels < 64 &&
-                      (unsigned long long)(v->data_blocks - 1) >>
-                      (v->hash_per_block_bits * v->levels))
-                       v->levels++;
-
-       if (v->levels > DM_VERITY_MAX_LEVELS) {
-               ti->error = "Too many tree levels";
-               r = -E2BIG;
-               goto bad;
-       }
-
-       hash_position = v->hash_start;
-       for (i = v->levels - 1; i >= 0; i--) {
-               sector_t s;
-               v->hash_level_block[i] = hash_position;
-               s = verity_position_at_level(v, v->data_blocks, i);
-               s = (s >> v->hash_per_block_bits) +
-                   !!(s & ((1 << v->hash_per_block_bits) - 1));
-               if (hash_position + s < hash_position) {
-                       ti->error = "Hash device offset overflow";
-                       r = -E2BIG;
-                       goto bad;
-               }
-               hash_position += s;
+       /* hashtree: device with hashes.
+        * Note, payload == hashtree is okay as long as the size of
+        *       ti->len passed to device mapper does not include
+        *       the hashes.
+        */
+       if (verity_get_device(ti, hashtree, vc->hash_start,
+                             dm_bht_sectors(&vc->bht), &vc->hash_dev)) {
+               ti->error = "Hash device lookup failed";
+               goto bad_hash_dev;
        }
-       v->hash_blocks = hash_position;
 
-       v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
-               1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
-               dm_bufio_alloc_callback, NULL);
-       if (IS_ERR(v->bufio)) {
-               ti->error = "Cannot initialize dm-bufio";
-               r = PTR_ERR(v->bufio);
-               v->bufio = NULL;
-               goto bad;
+       /* We leave the validity of the hash device open until the
+        * next arg.  Then we go ahead and try to read in all the bundle
+        * hashes which live after the block hashes.  If it fails, then
+        * the hash offset was wrong.
+        */
+
+
+       /* alg: cryptographic digest algorithm */
+       if (snprintf(vc->hash_alg, CRYPTO_MAX_ALG_NAME, "%s", alg) >=
+           CRYPTO_MAX_ALG_NAME) {
+               ti->error = "Hash algorithm name is too long";
+               goto bad_hash;
        }
 
-       if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
-               ti->error = "Hash device is too small";
-               r = -E2BIG;
-               goto bad;
+       /* override with optional device-specific error behavior */
+       vc->error_behavior = verity_parse_error_behavior(dev_error_behavior);
+       if (vc->error_behavior == -1) {
+               ti->error = "Bad error_behavior supplied";
+               goto bad_err_behavior;
        }
 
-       v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
-         sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
-       if (!v->io_mempool) {
-               ti->error = "Cannot allocate io mempool";
-               r = -ENOMEM;
-               goto bad;
+       /* TODO: Maybe issue a request on the io queue for block 0? */
+
+       /* Argument processing is done, setup operational data */
+       /* Pool for dm_verity_io objects */
+       ALLOCTRACE("slab pool for io objects");
+       vc->io_pool = mempool_create_slab_pool(MIN_IOS, _verity_io_pool);
+       if (!vc->io_pool) {
+               ti->error = "Cannot allocate verity io mempool";
+               goto bad_slab_pool;
        }
 
-       v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
-                                       BIO_MAX_PAGES * sizeof(struct bio_vec));
-       if (!v->vec_mempool) {
-               ti->error = "Cannot allocate vector mempool";
-               r = -ENOMEM;
-               goto bad;
+       /* Allocate the bioset used for request padding */
+       /* TODO(wad) allocate a separate bioset for the first verify maybe */
+       ALLOCTRACE("bioset for I/O reqs");
+       vc->bs = bioset_create(MIN_BIOS, 0);
+       if (!vc->bs) {
+               ti->error = "Cannot allocate verity bioset";
+               goto bad_bs;
        }
 
-       /* WQ_UNBOUND greatly improves performance when running on ramdisk */
-       v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
-       if (!v->verify_wq) {
-               ti->error = "Cannot allocate workqueue";
-               r = -ENOMEM;
-               goto bad;
+       ti->num_flush_requests = 1;
+       ti->private = vc;
+
+       /* TODO(wad) add device and hash device names */
+       {
+               char hashdev[BDEVNAME_SIZE], vdev[BDEVNAME_SIZE];
+               bdevname(vc->hash_dev->bdev, hashdev);
+               bdevname(vc->dev->bdev, vdev);
+               DMINFO("dev:%s hash:%s [sectors:%llu blocks:%llu]", vdev,
+                      hashdev, ULL(dm_bht_sectors(&vc->bht)), ULL(blocks));
        }
+       return 0;
+
+bad_bs:
+       mempool_destroy(vc->io_pool);
+bad_slab_pool:
+bad_err_behavior:
+bad_hash:
+       dm_put_device(ti, vc->hash_dev);
+bad_hash_dev:
+bad_hash_start:
+       dm_put_device(ti, vc->dev);
+bad_bht:
+bad_root_hexdigest:
+bad_verity_dev:
+       verity_cleanup_debugfs(vc);
+bad_debugfs:
+       kfree(vc);   /* hash is not secret so no need to zero */
+       return -EINVAL;
+}
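
With this constructor in place, a table using the key-value format can be
loaded from userspace with dmsetup. The line below is a sketch: the sector
count, device paths, and digest are placeholders, and only keys documented
in the constructor comment above are used:

    # table: <start> <length> verity <key=val ...>
    echo "0 2097152 verity payload=/dev/sda2 hashtree=/dev/sda3 \
          hashstart=2097152 alg=sha256 root_hexdigest=<64-hex-digest>" | \
        dmsetup create vroot --readonly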
+
+static void verity_dtr(struct dm_target *ti)
+{
+       struct verity_config *vc = (struct verity_config *) ti->private;
+
+       DMDEBUG("Destroying bs");
+       bioset_free(vc->bs);
+       DMDEBUG("Destroying io_pool");
+       mempool_destroy(vc->io_pool);
+
+       DMDEBUG("Destroying block hash tree");
+       dm_bht_destroy(&vc->bht);
+
+       DMDEBUG("Putting hash_dev");
+       dm_put_device(ti, vc->hash_dev);
+
+       DMDEBUG("Putting dev");
+       dm_put_device(ti, vc->dev);
+
+       DMDEBUG("Removing debugfs dir");
+       verity_cleanup_debugfs(vc);
+
+       DMDEBUG("Destroying config");
+       kfree(vc);
+}
+
+static int verity_status(struct dm_target *ti, status_type_t type,
+                       char *result, unsigned int maxlen)
+{
+       struct verity_config *vc = (struct verity_config *) ti->private;
+       unsigned int sz = 0;
+       char hashdev[BDEVNAME_SIZE], vdev[BDEVNAME_SIZE];
+       u8 hexdigest[VERITY_MAX_DIGEST_SIZE * 2 + 1] = { 0 };
 
+       dm_bht_root_hexdigest(&vc->bht, hexdigest, sizeof(hexdigest));
+
+       switch (type) {
+       case STATUSTYPE_INFO:
+               DMEMIT("%u %u %u %llu %llu",
+                      vc->stats.io_queue,
+                      vc->stats.verify_queue,
+                      vc->stats.average_requeues,
+                      vc->stats.total_requeues,
+                      vc->stats.total_requests);
+               break;
+
+       case STATUSTYPE_TABLE:
+               bdevname(vc->hash_dev->bdev, hashdev);
+               bdevname(vc->dev->bdev, vdev);
+               DMEMIT("/dev/%s /dev/%s %llu %u %s %s",
+                       vdev,
+                       hashdev,
+                       ULL(vc->hash_start),
+                       vc->bht.depth,
+                       vc->hash_alg,
+                       hexdigest);
+               break;
+       }
        return 0;
+}
+
+static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+                      struct bio_vec *biovec, int max_size)
+{
+       struct verity_config *vc = ti->private;
+       struct request_queue *q = bdev_get_queue(vc->dev->bdev);
 
-bad:
-       verity_dtr(ti);
+       if (!q->merge_bvec_fn)
+               return max_size;
 
-       return r;
+       bvm->bi_bdev = vc->dev->bdev;
+       bvm->bi_sector = vc->start + bvm->bi_sector - ti->begin;
+
+       /* Optionally, this could just return 0 to stick to single pages. */
+       return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int verity_iterate_devices(struct dm_target *ti,
+                                iterate_devices_callout_fn fn, void *data)
+{
+       struct verity_config *vc = ti->private;
+
+       return fn(ti, vc->dev, vc->start, ti->len, data);
+}
+
+static void verity_io_hints(struct dm_target *ti,
+                           struct queue_limits *limits)
+{
+       limits->logical_block_size = VERITY_BLOCK_SIZE;
+       limits->physical_block_size = VERITY_BLOCK_SIZE;
+       blk_limits_io_min(limits, VERITY_BLOCK_SIZE);
 }
 
 static struct target_type verity_target = {
-       .name           = "verity",
-       .version        = {1, 0, 0},
-       .module         = THIS_MODULE,
-       .ctr            = verity_ctr,
-       .dtr            = verity_dtr,
-       .map            = verity_map,
-       .status         = verity_status,
-       .ioctl          = verity_ioctl,
-       .merge          = verity_merge,
+       .name   = "verity",
+       .version = {0, 1, 0},
+       .module = THIS_MODULE,
+       .ctr    = verity_ctr,
+       .dtr    = verity_dtr,
+       .map    = verity_map,
+       .merge  = verity_merge,
+       .status = verity_status,
        .iterate_devices = verity_iterate_devices,
-       .io_hints       = verity_io_hints,
+       .io_hints = verity_io_hints,
 };
 
+#define VERITY_WQ_FLAGS (WQ_CPU_INTENSIVE|WQ_HIGHPRI)
+
 static int __init dm_verity_init(void)
 {
-       int r;
+       int r = -ENOMEM;
+
+       debugfs_root = debugfs_create_dir("dm-verity", NULL);
+       if (!debugfs_root) {
+               DMERR("failed to create debugfs directory");
+               r = -ENODEV;
+               goto bad_debugfs_dir;
+       }
+
+       _verity_io_pool = KMEM_CACHE(dm_verity_io, 0);
+       if (!_verity_io_pool) {
+               DMERR("failed to allocate pool dm_verity_io");
+               goto bad_io_pool;
+       }
+
+       kverityd_ioq = alloc_workqueue("kverityd_io", VERITY_WQ_FLAGS, 1);
+       if (!kverityd_ioq) {
+               DMERR("failed to create workqueue kverityd_ioq");
+               goto bad_io_queue;
+       }
+
+       kveritydq = alloc_workqueue("kverityd", VERITY_WQ_FLAGS, 1);
+       if (!kveritydq) {
+               DMERR("failed to create workqueue kveritydq");
+               goto bad_verify_queue;
+       }
 
        r = dm_register_target(&verity_target);
-       if (r < 0)
+       if (r < 0) {
                DMERR("register failed %d", r);
+               goto register_failed;
+       }
+
+       DMINFO("version %u.%u.%u loaded", verity_target.version[0],
+              verity_target.version[1], verity_target.version[2]);
 
        return r;
+
+register_failed:
+       destroy_workqueue(kveritydq);
+bad_verify_queue:
+       destroy_workqueue(kverityd_ioq);
+bad_io_queue:
+       kmem_cache_destroy(_verity_io_pool);
+bad_io_pool:
+       debugfs_remove_recursive(debugfs_root);
+bad_debugfs_dir:
+       return r;
 }
 
 static void __exit dm_verity_exit(void)
 {
+       destroy_workqueue(kveritydq);
+       destroy_workqueue(kverityd_ioq);
+
        dm_unregister_target(&verity_target);
+       kmem_cache_destroy(_verity_io_pool);
+       debugfs_remove_recursive(debugfs_root);
 }
 
 module_init(dm_verity_init);
 module_exit(dm_verity_exit);
 
-MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
-MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
-MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
+MODULE_AUTHOR("The Chromium OS Authors <chromium-os-dev@chromium.org>");
 MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
new file mode 100644 (file)
index 0000000..18d3d1a
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *                    All Rights Reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Provide error types for use when creating a custom error handler.
+ * See Documentation/device-mapper/dm-verity.txt
+ */
+#ifndef DM_VERITY_H
+#define DM_VERITY_H
+
+#include <linux/notifier.h>
+
+struct dm_verity_error_state {
+       int code;
+       int transient;  /* Not expected to recur after a reboot */
+       u64 block;
+       const char *message;
+
+       sector_t dev_start;
+       sector_t dev_len;
+       struct block_device *dev;
+
+       sector_t hash_dev_start;
+       sector_t hash_dev_len;
+       struct block_device *hash_dev;
+
+       /* Final behavior after all notifications are completed. */
+       int behavior;
+};
+
+/* This enum must be matched to allowed_error_behaviors in dm-verity.c */
+enum dm_verity_error_behavior {
+       DM_VERITY_ERROR_BEHAVIOR_EIO = 0,
+       DM_VERITY_ERROR_BEHAVIOR_PANIC,
+       DM_VERITY_ERROR_BEHAVIOR_NONE,
+       DM_VERITY_ERROR_BEHAVIOR_NOTIFY
+};
+
+
+int dm_verity_register_error_notifier(struct notifier_block *nb);
+int dm_verity_unregister_error_notifier(struct notifier_block *nb);
+
+#endif  /* DM_VERITY_H */
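
The register/unregister hooks above imply an external consumer. A minimal
sketch of one, assuming (as DM_VERITY_ERROR_BEHAVIOR_NOTIFY suggests) that
the notifier data pointer is the dm_verity_error_state defined here:

    #include <linux/kernel.h>
    #include <linux/notifier.h>
    #include "dm-verity.h"

    /* Sketch only: log the failure and leave the final behavior alone. */
    static int example_verity_notify(struct notifier_block *nb,
                                     unsigned long action, void *data)
    {
            struct dm_verity_error_state *state = data;

            pr_warn("dm-verity: error %d at block %llu (%s)\n",
                    state->code, (unsigned long long)state->block,
                    state->transient ? "transient" : "persistent");
            /* A handler may override state->behavior before returning. */
            return NOTIFY_OK;
    }

    static struct notifier_block example_verity_nb = {
            .notifier_call = example_verity_notify,
    };

    /* Pair these in module init/exit:
     *   dm_verity_register_error_notifier(&example_verity_nb);
     *   dm_verity_unregister_error_notifier(&example_verity_nb);
     */
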
index 98f34b8..4e30939 100644 (file)
@@ -301,6 +301,12 @@ void dm_put(struct mapped_device *md);
 void dm_set_mdptr(struct mapped_device *md, void *ptr);
 void *dm_get_mdptr(struct mapped_device *md);
 
+/*
+ * Export the device via the ioctl interface (uses mdptr).
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+                   const char *uuid);
+
 /*
  * A device can still be used while suspended, but I/O is deferred.
  */
diff --git a/include/linux/dm-bht.h b/include/linux/dm-bht.h
new file mode 100644 (file)
index 0000000..99a9425
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *
+ * Device-Mapper block hash tree interface.
+ * See Documentation/device-mapper/dm-bht.txt for details.
+ *
+ * This file is released under the GPLv2.
+ */
+#ifndef __LINUX_DM_BHT_H
+#define __LINUX_DM_BHT_H
+
+#include <linux/compiler.h>
+#include <linux/crypto.h>
+#include <linux/types.h>
+
+/* To avoid allocating memory for digest tests, we just set up a
+ * max to use for now.
+ */
+#define DM_BHT_MAX_DIGEST_SIZE 128  /* 1k hashes are unlikely for now */
+#define DM_BHT_SALT_SIZE       32   /* 256 bits of salt is a lot */
+
+/* UNALLOCATED, PENDING, READY, and VERIFIED are valid states. All other
+ * values are entry-related return codes.
+ */
+#define DM_BHT_ENTRY_VERIFIED 8  /* 'nodes' has been checked against parent */
+#define DM_BHT_ENTRY_READY 4  /* 'nodes' is loaded and available */
+#define DM_BHT_ENTRY_PENDING 2  /* 'nodes' is being loaded */
+#define DM_BHT_ENTRY_UNALLOCATED 0 /* untouched */
+#define DM_BHT_ENTRY_ERROR -1 /* entry is unsuitable for use */
+#define DM_BHT_ENTRY_ERROR_IO -2 /* I/O error on load */
+
+/* Additional possible return codes */
+#define DM_BHT_ENTRY_ERROR_MISMATCH -3 /* Digest mismatch */
+
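
Read together, the state codes above suggest the following lifecycle; this
is a sketch of the intended transitions, inferred from the names and from
the populate/read-completed functions declared below, not stated anywhere
normative:

    /*
     * UNALLOCATED --dm_bht_populate()------------------> PENDING
     * PENDING ------dm_bht_read_completed(entry, 0)----> READY
     * READY --------hash checks out against parent-----> VERIFIED
     *
     * Any failure parks an entry at ERROR, ERROR_IO, or ERROR_MISMATCH,
     * all of which are negative and unsuitable for further use.
     */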
+/* dm_bht_entry
+ * Contains dm_bht->node_count tree nodes at a given tree depth.
+ * state is used to transactionally assure that data is paged in
+ * from disk.  Unless dm_bht kept running crypto contexts for each
+ * level, we need to load in the data for on-demand verification.
+ */
+struct dm_bht_entry {
+       atomic_t state; /* see defines */
+       /* Keeping an extra pointer per entry wastes up to ~33k of
+        * memory if 1M blocks are used (or ~66k on a 64-bit arch)
+        */
+       void *io_context;  /* Reserve a pointer for use during io */
+       /* data should only be non-NULL if fully populated. */
+       u8 *nodes;  /* The hash data used to verify the children.
+                    * Guaranteed to be page-aligned.
+                    */
+};
+
+/* dm_bht_level
+ * Contains an array of entries which represent a page of hashes where
+ * each hash is a node in the tree at the given tree depth/level.
+ */
+struct dm_bht_level {
+       struct dm_bht_entry *entries;  /* array of entries of tree nodes */
+       unsigned int count;  /* number of entries at this level */
+       sector_t sector;  /* starting sector for this level */
+};
+
+/* opaque context, start, databuf, sector_count */
+typedef int(*dm_bht_callback)(void *,  /* external context */
+                             sector_t,  /* start sector */
+                             u8 *,  /* destination page */
+                             sector_t,  /* num sectors */
+                             struct dm_bht_entry *);
+/* dm_bht - Device mapper block hash tree
+ * dm_bht provides a fixed interface for comparing data blocks
+ * against cryptographic hashes stored in a hash tree. It
+ * optimizes the tree structure for storage on disk.
+ *
+ * The tree is built from the bottom up.  A collection of data,
+ * external to the tree, is hashed and these hashes are stored
+ * as the blocks in the tree.  For some number of these hashes,
+ * a parent node is created by hashing them.  These steps are
+ * repeated.
+ *
+ * TODO(wad): All hash storage memory is pre-allocated and freed once an
+ * entire branch has been verified.
+ */
+struct dm_bht {
+       /* Configured values */
+       int depth;  /* Depth of the tree including the root */
+       unsigned int block_count;  /* Number of blocks hashed */
+       char hash_alg[CRYPTO_MAX_ALG_NAME];
+       unsigned char salt[DM_BHT_SALT_SIZE];
+
+       /* This is a temporary hack to ease the transition to salting. It will
+        * be removed once salting is supported both in kernel and userspace,
+        * and the salt will default to all zeroes instead. */
+       bool have_salt;
+
+       /* Computed values */
+       unsigned int node_count;  /* Data size (in hashes) for each entry */
+       unsigned int node_count_shift;  /* first bit set - 1 */
+       /* There is one per CPU so that verification can run simultaneously. */
+       struct hash_desc hash_desc[NR_CPUS];  /* Container for the hash alg */
+       unsigned int digest_size;
+       sector_t sectors;  /* Number of disk sectors used */
+
+       /* bool verified;  Full tree is verified */
+       u8 root_digest[DM_BHT_MAX_DIGEST_SIZE];
+       struct dm_bht_level *levels;  /* in reverse order */
+       /* Callbacks for reading and/or writing to the hash device */
+       dm_bht_callback read_cb;
+       dm_bht_callback write_cb;
+};
+
+/* Constructor for struct dm_bht instances. */
+int dm_bht_create(struct dm_bht *bht,
+                 unsigned int block_count,
+                 const char *alg_name);
+/* Destructor for struct dm_bht instances.  Does not free @bht */
+int dm_bht_destroy(struct dm_bht *bht);
+
+/* Basic accessors for struct dm_bht */
+sector_t dm_bht_sectors(const struct dm_bht *bht);
+void dm_bht_set_read_cb(struct dm_bht *bht, dm_bht_callback read_cb);
+void dm_bht_set_write_cb(struct dm_bht *bht, dm_bht_callback write_cb);
+int dm_bht_set_root_hexdigest(struct dm_bht *bht, const u8 *hexdigest);
+int dm_bht_root_hexdigest(struct dm_bht *bht, u8 *hexdigest, int available);
+void dm_bht_set_salt(struct dm_bht *bht, const char *hexsalt);
+int dm_bht_salt(struct dm_bht *bht, char *hexsalt);
+
+/* Functions for loading in data from disk for verification */
+bool dm_bht_is_populated(struct dm_bht *bht, unsigned int block);
+int dm_bht_populate(struct dm_bht *bht, void *read_cb_ctx,
+                   unsigned int block);
+int dm_bht_verify_block(struct dm_bht *bht, unsigned int block,
+                       struct page *pg, unsigned int offset);
+
+/* Functions for creating struct dm_bhts on disk.  A newly created dm_bht
+ * should not be directly used for verification. (It should be repopulated.)
+ * In addition, these functions aren't meant to be called in parallel.
+ */
+int dm_bht_compute(struct dm_bht *bht, void *read_cb_ctx);
+int dm_bht_sync(struct dm_bht *bht, void *write_cb_ctx);
+int dm_bht_store_block(struct dm_bht *bht, unsigned int block,
+                      u8 *block_data);
+int dm_bht_zeroread_callback(void *ctx, sector_t start, u8 *dst, sector_t count,
+                            struct dm_bht_entry *entry);
+void dm_bht_read_completed(struct dm_bht_entry *entry, int status);
+void dm_bht_write_completed(struct dm_bht_entry *entry, int status);
+#endif  /* __LINUX_DM_BHT_H */
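
Pieced together from the declarations above, the verify-side flow looks
roughly like the sketch below. It assumes a synchronously completing read
callback (dm_bht_zeroread_callback stands in here; a real user supplies
one that reads the hash device, as dm-verity.c does with
kverityd_bht_read_callback):

    #include <linux/dm-bht.h>

    /* Sketch: build a tree, set its expected root, verify one block. */
    static int example_verify_one(struct dm_bht *bht,
                                  unsigned int block_count,
                                  const u8 *root_hexdigest,
                                  unsigned int block, struct page *pg,
                                  unsigned int offset)
    {
            int r;

            r = dm_bht_create(bht, block_count, "sha256");
            if (r)
                    return r;
            dm_bht_set_read_cb(bht, dm_bht_zeroread_callback);
            r = dm_bht_set_root_hexdigest(bht, root_hexdigest);
            if (r)
                    goto out;
            r = dm_bht_populate(bht, NULL /* read_cb_ctx */, block);
            if (r < 0)
                    goto out;
            r = dm_bht_verify_block(bht, block, pg, offset);
    out:
            dm_bht_destroy(bht);
            return r;
    }
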
index 0bf677a..1677baa 100644 (file)
@@ -14,6 +14,7 @@ mounts-y                      := do_mounts.o
 mounts-$(CONFIG_BLK_DEV_RAM)   += do_mounts_rd.o
 mounts-$(CONFIG_BLK_DEV_INITRD)        += do_mounts_initrd.o
 mounts-$(CONFIG_BLK_DEV_MD)    += do_mounts_md.o
+mounts-$(CONFIG_BLK_DEV_DM)    += do_mounts_dm.o
 
 # dependencies on generated files need to be listed explicitly
 $(obj)/version.o: include/generated/compile.h
index 3f33263..e842840 100644 (file)
@@ -524,6 +524,7 @@ void __init prepare_namespace(void)
        async_synchronize_full();
 
        md_run_setup();
+       dm_run_setup();
 
        if (saved_root_name[0]) {
                root_device_name = saved_root_name;
index f5b978a..09d2286 100644 (file)
@@ -74,3 +74,13 @@ void md_run_setup(void);
 static inline void md_run_setup(void) {}
 
 #endif
+
+#ifdef CONFIG_BLK_DEV_DM
+
+void dm_run_setup(void);
+
+#else
+
+static inline void dm_run_setup(void) {}
+
+#endif
diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c
new file mode 100644 (file)
index 0000000..e268549
--- /dev/null
@@ -0,0 +1,350 @@
+/* do_mounts_dm.c
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *                    All Rights Reserved.
+ * Based on do_mounts_md.c
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/async.h>
+#include <linux/device-mapper.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+
+#include "do_mounts.h"
+
+#define DM_MAX_NAME 32
+#define DM_MAX_UUID 129
+#define DM_NO_UUID "none"
+
+#define DM_MSG_PREFIX "init"
+
+/* Separators used for parsing the dm= argument. */
+#define DM_FIELD_SEP ' '
+#define DM_LINE_SEP ','
+
+/*
+ * When the device-mapper and any targets are compiled into the kernel
+ * (not a module), one mapped device may be created and used as the root
+ * device at boot time with the parameters given on the boot line dm=...
+ * The code for that is here.
+ */
+
+struct dm_setup_target {
+       sector_t begin;
+       sector_t length;
+       char *type;
+       char *params;
+       /* simple singly linked list */
+       struct dm_setup_target *next;
+};
+
+static struct {
+       int minor;
+       int ro;
+       char name[DM_MAX_NAME];
+       char uuid[DM_MAX_UUID];
+       char *targets;
+       struct dm_setup_target *target;
+       int target_count;
+} dm_setup_args __initdata;
+
+static __initdata int dm_early_setup;
+
+static size_t __init get_dm_option(char *str, char **next, char sep)
+{
+       size_t len = 0;
+       char *endp = NULL;
+
+       if (!str)
+               return 0;
+
+       endp = strchr(str, sep);
+       if (!endp) {  /* act like strchrnul */
+               len = strlen(str);
+               endp = str + len;
+       } else {
+               len = endp - str;
+       }
+
+       if (endp == str)
+               return 0;
+
+       if (!next)
+               return len;
+
+       if (*endp == 0) {
+               /* Don't advance past the nul. */
+               *next = endp;
+       } else {
+               *next = endp + 1;
+       }
+       return len;
+}
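
A short illustration of the tokenizer's contract, with hypothetical input:

    /*
     * char *s = "vroot none ro,0 1 ...", *next;
     *
     * get_dm_option(s, &next, DM_FIELD_SEP)    -> 5 ("vroot"),
     *                                             next -> "none ro,..."
     * get_dm_option(next, &next, DM_FIELD_SEP) -> 4 ("none")
     *
     * When no separator remains, the tail length is returned and *next
     * is left pointing at the terminating NUL, not past it.
     */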
+
+static int __init dm_setup_args_init(void)
+{
+       dm_setup_args.minor = 0;
+       dm_setup_args.ro = 0;
+       dm_setup_args.target = NULL;
+       dm_setup_args.target_count = 0;
+       return 0;
+}
+
+static int __init dm_setup_cleanup(void)
+{
+       struct dm_setup_target *target = dm_setup_args.target;
+       struct dm_setup_target *old_target = NULL;
+       while (target) {
+               kfree(target->type);
+               kfree(target->params);
+               old_target = target;
+               target = target->next;
+               kfree(old_target);
+               dm_setup_args.target_count--;
+       }
+       BUG_ON(dm_setup_args.target_count);
+       return 0;
+}
+
+static char * __init dm_setup_parse_device_args(char *str)
+{
+       char *next = NULL;
+       size_t len = 0;
+
+       /* Grab the logical name of the device to be exported to udev */
+       len = get_dm_option(str, &next, DM_FIELD_SEP);
+       if (!len) {
+               DMERR("failed to parse device name");
+               goto parse_fail;
+       }
+       len = min(len + 1, sizeof(dm_setup_args.name));
+       strlcpy(dm_setup_args.name, str, len);  /* includes nul */
+       str = skip_spaces(next);
+
+       /* Grab the UUID value or "none" */
+       len = get_dm_option(str, &next, DM_FIELD_SEP);
+       if (!len) {
+               DMERR("failed to parse device uuid");
+               goto parse_fail;
+       }
+       len = min(len + 1, sizeof(dm_setup_args.uuid));
+       strlcpy(dm_setup_args.uuid, str, len);
+       str = skip_spaces(next);
+
+       /* Determine if the table/device will be read only or read-write */
+       if (!strncmp("ro,", str, 3)) {
+               dm_setup_args.ro = 1;
+       } else if (!strncmp("rw,", str, 3)) {
+               dm_setup_args.ro = 0;
+       } else {
+               DMERR("failed to parse table mode");
+               goto parse_fail;
+       }
+       str = skip_spaces(str + 3);
+
+       return str;
+
+parse_fail:
+       return NULL;
+}
+
+static int __init dm_setup_parse_targets(char *str)
+{
+       char *next = NULL;
+       size_t len = 0;
+       struct dm_setup_target **target = NULL;
+
+       /* Targets are defined as per the table format but with a
+        * comma as a newline separator. */
+       target = &dm_setup_args.target;
+       while (str && *str) {
+               *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL);
+               if (!*target) {
+                       DMERR("failed to allocate memory for target %d",
+                             dm_setup_args.target_count);
+                       goto parse_fail;
+               }
+               dm_setup_args.target_count++;
+
+               (*target)->begin = simple_strtoull(str, &next, 10);
+               if (!next || *next != DM_FIELD_SEP) {
+                       DMERR("failed to parse starting sector for target %d",
+                             dm_setup_args.target_count - 1);
+                       goto parse_fail;
+               }
+               str = skip_spaces(next + 1);
+
+               (*target)->length = simple_strtoull(str, &next, 10);
+               if (!next || *next != DM_FIELD_SEP) {
+                       DMERR("failed to parse length for target %d",
+                             dm_setup_args.target_count - 1);
+                       goto parse_fail;
+               }
+               str = skip_spaces(next + 1);
+
+               len = get_dm_option(str, &next, DM_FIELD_SEP);
+               if (!len ||
+                   !((*target)->type = kstrndup(str, len, GFP_KERNEL))) {
+                       DMERR("failed to parse type for target %d",
+                             dm_setup_args.target_count - 1);
+                       goto parse_fail;
+               }
+               str = skip_spaces(next);
+
+               len = get_dm_option(str, &next, DM_LINE_SEP);
+               if (!len ||
+                   !((*target)->params = kstrndup(str, len, GFP_KERNEL))) {
+                       DMERR("failed to parse params for target %d",
+                             dm_setup_args.target_count - 1);
+                       goto parse_fail;
+               }
+               str = skip_spaces(next);
+               target = &((*target)->next);
+       }
+       DMDEBUG("parsed %d targets", dm_setup_args.target_count);
+
+       return 0;
+
+parse_fail:
+       return 1;
+}
+
+/*
+ * Parse the command-line parameters given to the kernel, but do not
+ * actually try to set up the DM device now; that is handled by
+ * dm_setup_drive after the low-level disk drivers have initialised.
+ * dm format is as follows:
+ *  dm="name uuid fmode,[table line 1],[table line 2],..."
+ * May be used with root=/dev/dm-0 as it always uses the first dm minor.
+ */
+
+static int __init dm_setup(char *str)
+{
+       dm_setup_args_init();
+
+       str = dm_setup_parse_device_args(str);
+       if (!str) {
+               DMDEBUG("str is NULL");
+               goto parse_fail;
+       }
+
+       /* Target parsing is delayed until we have dynamic memory */
+       dm_setup_args.targets = str;
+
+       printk(KERN_INFO "dm: will configure '%s' on dm-%d\n",
+              dm_setup_args.name, dm_setup_args.minor);
+
+       dm_early_setup = 1;
+       return 1;
+
+parse_fail:
+       printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n");
+       return 0;
+}
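
Following the documented format, a verity root could be described directly
on the command line; everything below is a placeholder sketch (the sector
count, devices, and digest in particular):

    dm="vroot none ro,0 2097152 verity payload=/dev/sda2 hashtree=/dev/sda3 alg=sha256 root_hexdigest=<64-hex-digest>" root=/dev/dm-0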
+
+
+static void __init dm_setup_drive(void)
+{
+       struct mapped_device *md = NULL;
+       struct dm_table *table = NULL;
+       struct dm_setup_target *target;
+       char *uuid = dm_setup_args.uuid;
+       fmode_t fmode = FMODE_READ;
+
+       /* Finish parsing the targets. */
+       if (dm_setup_parse_targets(dm_setup_args.targets))
+               goto parse_fail;
+
+       if (dm_create(dm_setup_args.minor, &md)) {
+               DMDEBUG("failed to create the device");
+               goto dm_create_fail;
+       }
+       DMDEBUG("created device '%s'", dm_device_name(md));
+
+       /* In addition to flagging the table below, the disk must be
+        * set explicitly ro/rw. */
+       set_disk_ro(dm_disk(md), dm_setup_args.ro);
+
+       if (!dm_setup_args.ro)
+               fmode |= FMODE_WRITE;
+       if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) {
+               DMDEBUG("failed to create the table");
+               goto dm_table_create_fail;
+       }
+
+       target = dm_setup_args.target;
+       while (target) {
+               DMINFO("adding target '%llu %llu %s %s'",
+                      (unsigned long long) target->begin,
+                      (unsigned long long) target->length, target->type,
+                      target->params);
+               if (dm_table_add_target(table, target->type, target->begin,
+                                       target->length, target->params)) {
+                       DMDEBUG("failed to add the target to the table");
+                       goto add_target_fail;
+               }
+               target = target->next;
+       }
+
+       if (dm_table_complete(table)) {
+               DMDEBUG("failed to complete the table");
+               goto table_complete_fail;
+       }
+
+       /* Suspend the device so that we can bind it to the table. */
+       if (dm_suspend(md, 0)) {
+               DMDEBUG("failed to suspend the device pre-bind");
+               goto suspend_fail;
+       }
+
+       /* Bind the table to the device. This is the only way to associate
+        * md->map with the table and set the disk capacity directly. */
+       if (dm_swap_table(md, table)) {  /* should return NULL. */
+               DMDEBUG("failed to bind the device to the table");
+               goto table_bind_fail;
+       }
+
+       /* Finally, resume and the device should be ready. */
+       if (dm_resume(md)) {
+               DMDEBUG("failed to resume the device");
+               goto resume_fail;
+       }
+
+       /* Export the dm device via the ioctl interface */
+       if (!strcmp(DM_NO_UUID, dm_setup_args.uuid))
+               uuid = NULL;
+       if (dm_ioctl_export(md, dm_setup_args.name, uuid)) {
+               DMDEBUG("failed to export device with given name and uuid");
+               goto export_fail;
+       }
+       printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor);
+
+       dm_setup_cleanup();
+       return;
+
+export_fail:
+resume_fail:
+table_bind_fail:
+suspend_fail:
+table_complete_fail:
+add_target_fail:
+       dm_table_put(table);
+dm_table_create_fail:
+       dm_put(md);
+dm_create_fail:
+       dm_setup_cleanup();
+parse_fail:
+       printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n",
+              dm_setup_args.minor, dm_setup_args.name);
+}
+
+__setup("dm=", dm_setup);
+
+void __init dm_run_setup(void)
+{
+       if (!dm_early_setup)
+               return;
+       printk(KERN_INFO "dm: attempting early device configuration.\n");
+       dm_setup_drive();
+}