Btrfs: share the same code for __record_{new,deleted}_ref
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 945d1db..112eb64 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/radix-tree.h>
-#include <linux/crc32c.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
 
 #include "send.h"
 #include "backref.h"
+#include "hash.h"
 #include "locking.h"
 #include "disk-io.h"
 #include "btrfs_inode.h"
@@ -51,15 +51,18 @@ struct fs_path {
                struct {
                        char *start;
                        char *end;
-                       char *prepared;
 
                        char *buf;
-                       int buf_len;
-                       unsigned int reversed:1;
-                       unsigned int virtual_mem:1;
+                       unsigned short buf_len:15;
+                       unsigned short reversed:1;
                        char inline_buf[];
                };
-               char pad[PAGE_SIZE];
+               /*
+                * Average path length does not exceed 200 bytes, so we'll
+                * get better packing in the slab and a higher chance to
+                * satisfy an allocation later during send.
+                */
+               char pad[256];
        };
 };
 #define FS_PATH_INLINE_SIZE \
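
A note on the fs_path hunk above: the pad shrinks from PAGE_SIZE to 256 bytes
and buf_len/reversed are packed into a single unsigned short. Below is a
minimal userspace sketch of the same union-with-pad layout; it is not part of
the patch, the struct name and DEMO_INLINE_SIZE are made-up stand-ins, and the
real FS_PATH_INLINE_SIZE definition is cut off at the hunk boundary above.

#include <stdio.h>
#include <stddef.h>

struct demo_path {
        union {
                /* A struct with a flexible array member inside a union is a
                 * GNU C extension, the same construct the kernel uses. */
                struct {
                        char *start;
                        char *end;

                        char *buf;
                        unsigned short buf_len:15;
                        unsigned short reversed:1;
                        char inline_buf[];
                };
                char pad[256];  /* fixes sizeof(struct demo_path) */
        };
};

/* Hypothetical analogue of FS_PATH_INLINE_SIZE: the space the pad reserves
 * beyond the named members is what the inline buffer can use. */
#define DEMO_INLINE_SIZE \
        (sizeof(struct demo_path) - offsetof(struct demo_path, inline_buf))

int main(void)
{
        printf("sizeof(struct demo_path) = %zu\n", sizeof(struct demo_path));
        printf("inline buffer capacity   = %zu\n", DEMO_INLINE_SIZE);
        return 0;
}

With 8-byte pointers this typically prints 256 and 230, so the ~200 byte paths
mentioned in the comment fit the inline buffer without any extra allocation.
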
@@ -88,8 +91,6 @@ struct send_ctx {
        u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
        u64 flags;      /* 'flags' member of btrfs_ioctl_send_args is u64 */
 
-       struct vfsmount *mnt;
-
        struct btrfs_root *send_root;
        struct btrfs_root *parent_root;
        struct clone_root *clone_roots;
@@ -111,6 +112,8 @@ struct send_ctx {
        int cur_inode_deleted;
        u64 cur_inode_size;
        u64 cur_inode_mode;
+       u64 cur_inode_rdev;
+       u64 cur_inode_last_extent;
 
        u64 send_progress;
 
@@ -122,6 +125,127 @@ struct send_ctx {
        int name_cache_size;
 
        char *read_buf;
+
+       /*
+        * We process inodes in increasing order, so if before an
+        * incremental send we reverse the parent/child relationship of
+        * directories such that a directory with a lower inode number was
+        * the parent of a directory with a higher inode number, and the one
+        * becoming the new parent got renamed too, we can't rename/move the
+        * directory with lower inode number when we finish processing it - we
+        * must process the directory with higher inode number first, then
+        * rename/move it and then rename/move the directory with lower inode
+        * number. Example follows.
+        *
+        * Tree state when the first send was performed:
+        *
+        * .
+        * |-- a                   (ino 257)
+        *     |-- b               (ino 258)
+        *         |
+        *         |
+        *         |-- c           (ino 259)
+        *         |   |-- d       (ino 260)
+        *         |
+        *         |-- c2          (ino 261)
+        *
+        * Tree state when the second (incremental) send is performed:
+        *
+        * .
+        * |-- a                   (ino 257)
+        *     |-- b               (ino 258)
+        *         |-- c2          (ino 261)
+        *             |-- d2      (ino 260)
+        *                 |-- cc  (ino 259)
+        *
+        * The sequence of steps that led to the second state was:
+        *
+        * mv /a/b/c/d /a/b/c2/d2
+        * mv /a/b/c /a/b/c2/d2/cc
+        *
+        * "c" has a lower inode number, but we can't move it (2nd mv
+        * operation) before we move "d", which has a higher inode number.
+        *
+        * So we just memorize which move/rename operations must be performed
+        * later when their respective parent is processed and moved/renamed.
+        */
+
+       /* Indexed by parent directory inode number. */
+       struct rb_root pending_dir_moves;
+
+       /*
+        * Reverse index, indexed by the inode number of a directory that
+        * is waiting for the move/rename of its immediate parent before its
+        * own move/rename can be performed.
+        */
+       struct rb_root waiting_dir_moves;
+
+       /*
+        * A directory that is going to be rm'ed might have a child directory
+        * which is in the pending directory moves index above. In this case,
+        * the directory can only be removed after the move/rename of its child
+        * is performed. Example:
+        *
+        * Parent snapshot:
+        *
+        * .                        (ino 256)
+        * |-- a/                   (ino 257)
+        *     |-- b/               (ino 258)
+        *         |-- c/           (ino 259)
+        *         |   |-- x/       (ino 260)
+        *         |
+        *         |-- y/           (ino 261)
+        *
+        * Send snapshot:
+        *
+        * .                        (ino 256)
+        * |-- a/                   (ino 257)
+        *     |-- b/               (ino 258)
+        *         |-- YY/          (ino 261)
+        *              |-- x/      (ino 260)
+        *
+        * Sequence of steps that led to the send snapshot:
+        * rm -f /a/b/c/foo.txt
+        * mv /a/b/y /a/b/YY
+        * mv /a/b/c/x /a/b/YY
+        * rmdir /a/b/c
+        *
+        * When the child is processed, its move/rename is delayed until its
+        * parent is processed (as explained above), but all other operations
+        * like updating utimes, chown, chgrp, etc., are performed and the paths
+        * that it uses for those operations must use the orphanized name of
+        * its parent (the directory we're going to rm later), so we need to
+        * memorize that name.
+        *
+        * Indexed by the inode number of the directory to be deleted.
+        */
+       struct rb_root orphan_dirs;
+};
+
+struct pending_dir_move {
+       struct rb_node node;
+       struct list_head list;
+       u64 parent_ino;
+       u64 ino;
+       u64 gen;
+       struct list_head update_refs;
+};
+
+struct waiting_dir_move {
+       struct rb_node node;
+       u64 ino;
+       /*
+        * There might be some directory that could not be removed because it
+        * was waiting for this directory inode to be moved first. Therefore,
+        * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+        */
+       u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+       struct rb_node node;
+       u64 ino;
+       u64 gen;
 };
 
 struct name_cache_entry {
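
The send_ctx comment above explains why some directory moves have to be
deferred until their new parent is processed. The following is a minimal
userspace sketch, not part of the patch, that replays the ino 259/260 example
with a plain array standing in for the pending_dir_moves/waiting_dir_moves
rb-trees; the struct and function names here are made up.

#include <stdio.h>

struct dir {
        unsigned long long ino;
        unsigned long long new_parent;  /* parent inode in the send snapshot */
        const char *new_name;
        int moved;
};

/* The two directories that trade places in the example above. */
static struct dir dirs[] = {
        { 259, 260, "cc", 0 },  /* "c": its new parent (260) moves too       */
        { 260, 261, "d2", 0 },  /* "d": its new parent (261, "c2") stays put */
};
#define NDIRS (sizeof(dirs) / sizeof(dirs[0]))

static struct dir *find_moving_dir(unsigned long long ino)
{
        size_t i;

        for (i = 0; i < NDIRS; i++)
                if (dirs[i].ino == ino)
                        return &dirs[i];
        return NULL;
}

static void apply_move(struct dir *d)
{
        printf("rename ino %llu to %s under ino %llu\n",
               d->ino, d->new_name, d->new_parent);
        d->moved = 1;
}

int main(void)
{
        size_t i, j;

        /* Process inodes in increasing order, as send does. */
        for (i = 0; i < NDIRS; i++) {
                struct dir *parent = find_moving_dir(dirs[i].new_parent);

                if (parent && !parent->moved) {
                        /* What pending_dir_moves/waiting_dir_moves record. */
                        printf("defer ino %llu until ino %llu is moved\n",
                               dirs[i].ino, parent->ino);
                        continue;
                }
                apply_move(&dirs[i]);

                /* Replay any move that was waiting on this inode. */
                for (j = 0; j < NDIRS; j++)
                        if (!dirs[j].moved &&
                            dirs[j].new_parent == dirs[i].ino)
                                apply_move(&dirs[j]);
        }
        return 0;
}

It defers ino 259, renames 260 to d2 first and only then 259 to cc, which is
exactly the ordering the comment requires; the orphan_dirs tree plays the
analogous role for rmdirs that must wait for such a deferred move.
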
@@ -145,6 +269,20 @@ struct name_cache_entry {
        char name[];
 };
 
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
+static int need_send_hole(struct send_ctx *sctx)
+{
+       return (sctx->parent_root && !sctx->cur_inode_new &&
+               !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
+               S_ISREG(sctx->cur_inode_mode));
+}
+
 static void fs_path_reset(struct fs_path *p)
 {
        if (p->reversed) {
@@ -166,7 +304,6 @@ static struct fs_path *fs_path_alloc(void)
        if (!p)
                return NULL;
        p->reversed = 0;
-       p->virtual_mem = 0;
        p->buf = p->inline_buf;
        p->buf_len = FS_PATH_INLINE_SIZE;
        fs_path_reset(p);
@@ -189,12 +326,8 @@ static void fs_path_free(struct fs_path *p)
 {
        if (!p)
                return;
-       if (p->buf != p->inline_buf) {
-               if (p->virtual_mem)
-                       vfree(p->buf);
-               else
-                       kfree(p->buf);
-       }
+       if (p->buf != p->inline_buf)
+               kfree(p->buf);
        kfree(p);
 }
 
@@ -216,40 +349,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
 
        path_len = p->end - p->start;
        old_buf_len = p->buf_len;
-       len = PAGE_ALIGN(len);
-
-       if (p->buf == p->inline_buf) {
-               tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
-               if (!tmp_buf) {
-                       tmp_buf = vmalloc(len);
-                       if (!tmp_buf)
-                               return -ENOMEM;
-                       p->virtual_mem = 1;
-               }
-               memcpy(tmp_buf, p->buf, p->buf_len);
-               p->buf = tmp_buf;
-               p->buf_len = len;
-       } else {
-               if (p->virtual_mem) {
-                       tmp_buf = vmalloc(len);
-                       if (!tmp_buf)
-                               return -ENOMEM;
-                       memcpy(tmp_buf, p->buf, p->buf_len);
-                       vfree(p->buf);
-               } else {
-                       tmp_buf = krealloc(p->buf, len, GFP_NOFS);
-                       if (!tmp_buf) {
-                               tmp_buf = vmalloc(len);
-                               if (!tmp_buf)
-                                       return -ENOMEM;
-                               memcpy(tmp_buf, p->buf, p->buf_len);
-                               kfree(p->buf);
-                               p->virtual_mem = 1;
-                       }
-               }
-               p->buf = tmp_buf;
-               p->buf_len = len;
-       }
+
+       /*
+        * First time the inline_buf does not suffice
+        */
+       if (p->buf == p->inline_buf)
+               tmp_buf = kmalloc(len, GFP_NOFS);
+       else
+               tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+       if (!tmp_buf)
+               return -ENOMEM;
+       p->buf = tmp_buf;
+       /*
+        * The real size of the buffer is bigger than requested, so record it;
+        * this lets the fast path happen most of the time.
+        */
+       p->buf_len = ksize(p->buf);
+
        if (p->reversed) {
                tmp_buf = p->buf + old_buf_len - path_len - 1;
                p->end = p->buf + p->buf_len - 1;
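
The rewritten fs_path_ensure_buf() above records ksize() of the allocation
rather than the requested length, so subsequent calls usually take the early
"buffer is already big enough" path. A userspace sketch of the same idea, not
part of the patch; glibc's malloc_usable_size() stands in for ksize() purely
for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

struct buf {
        char *data;
        size_t len;     /* capacity we remember, like fs_path::buf_len */
};

static int buf_ensure(struct buf *b, size_t len)
{
        char *tmp;

        if (len <= b->len)              /* fast path: nothing to do */
                return 0;
        tmp = realloc(b->data, len);    /* krealloc() in the patch */
        if (!tmp)
                return -1;
        b->data = tmp;
        /*
         * Remember what the allocator really handed out, not just what we
         * asked for, so later calls are more likely to hit the fast path.
         */
        b->len = malloc_usable_size(tmp);
        return 0;
}

int main(void)
{
        struct buf b = { NULL, 0 };

        if (buf_ensure(&b, 200) == 0)
                printf("asked for 200 bytes, capacity recorded: %zu\n", b.len);
        free(b.data);
        return 0;
}

In the kernel the effect is more pronounced: a 200 byte kmalloc() comes out of
the 256 byte slab, so ksize() reports 256 and buf_len grows in slab-sized
steps rather than byte-exact ones.
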
@@ -262,7 +378,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
        return 0;
 }
 
-static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+                                  char **prepared)
 {
        int ret;
        int new_len;
@@ -278,11 +395,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
                if (p->start != p->end)
                        *--p->start = '/';
                p->start -= name_len;
-               p->prepared = p->start;
+               *prepared = p->start;
        } else {
                if (p->start != p->end)
                        *p->end++ = '/';
-               p->prepared = p->end;
+               *prepared = p->end;
                p->end += name_len;
                *p->end = 0;
        }
@@ -294,12 +411,12 @@ out:
 static int fs_path_add(struct fs_path *p, const char *name, int name_len)
 {
        int ret;
+       char *prepared;
 
-       ret = fs_path_prepare_for_add(p, name_len);
+       ret = fs_path_prepare_for_add(p, name_len, &prepared);
        if (ret < 0)
                goto out;
-       memcpy(p->prepared, name, name_len);
-       p->prepared = NULL;
+       memcpy(prepared, name, name_len);
 
 out:
        return ret;
@@ -308,12 +425,12 @@ out:
 static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
 {
        int ret;
+       char *prepared;
 
-       ret = fs_path_prepare_for_add(p, p2->end - p2->start);
+       ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
        if (ret < 0)
                goto out;
-       memcpy(p->prepared, p2->start, p2->end - p2->start);
-       p->prepared = NULL;
+       memcpy(prepared, p2->start, p2->end - p2->start);
 
 out:
        return ret;
@@ -324,28 +441,18 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
                                          unsigned long off, int len)
 {
        int ret;
+       char *prepared;
 
-       ret = fs_path_prepare_for_add(p, len);
+       ret = fs_path_prepare_for_add(p, len, &prepared);
        if (ret < 0)
                goto out;
 
-       read_extent_buffer(eb, p->prepared, off, len);
-       p->prepared = NULL;
+       read_extent_buffer(eb, prepared, off, len);
 
 out:
        return ret;
 }
 
-#if 0
-static void fs_path_remove(struct fs_path *p)
-{
-       BUG_ON(p->reversed);
-       while (p->start != p->end && *p->end != '/')
-               p->end--;
-       *p->end = 0;
-}
-#endif
-
 static int fs_path_copy(struct fs_path *p, struct fs_path *from)
 {
        int ret;
@@ -436,30 +543,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
        return 0;
 }
 
-#if 0
-static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value)
-{
-       return tlv_put(sctx, attr, &value, sizeof(value));
-}
-
-static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value)
-{
-       __le16 tmp = cpu_to_le16(value);
-       return tlv_put(sctx, attr, &tmp, sizeof(tmp));
-}
-
-static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value)
-{
-       __le32 tmp = cpu_to_le32(value);
-       return tlv_put(sctx, attr, &tmp, sizeof(tmp));
-}
-#endif
+#define TLV_PUT_DEFINE_INT(bits) \
+       static int tlv_put_u##bits(struct send_ctx *sctx,               \
+                       u##bits attr, u##bits value)                    \
+       {                                                               \
+               __le##bits __tmp = cpu_to_le##bits(value);              \
+               return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));      \
+       }
 
-static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value)
-{
-       __le64 tmp = cpu_to_le64(value);
-       return tlv_put(sctx, attr, &tmp, sizeof(tmp));
-}
+TLV_PUT_DEFINE_INT(64)
 
 static int tlv_put_string(struct send_ctx *sctx, u16 attr,
                          const char *str, int len)
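
For reference, TLV_PUT_DEFINE_INT(64) above expands, whitespace aside, to
roughly the helper that the removed open-coded tlv_put_u64() used to provide:

static int tlv_put_u64(struct send_ctx *sctx, u64 attr, u64 value)
{
        __le64 __tmp = cpu_to_le64(value);
        return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));
}

One difference: the macro types attr as u##bits (u64 here) instead of the u16
the old helper took; tlv_put() itself still takes a u16 attr, so the argument
is simply narrowed at that call.
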
@@ -475,17 +567,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
        return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
 }
 
-#if 0
-static int tlv_put_timespec(struct send_ctx *sctx, u16 attr,
-                           struct timespec *ts)
-{
-       struct btrfs_timespec bts;
-       bts.sec = cpu_to_le64(ts->tv_sec);
-       bts.nsec = cpu_to_le32(ts->tv_nsec);
-       return tlv_put(sctx, attr, &bts, sizeof(bts));
-}
-#endif
-
 static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
                                  struct extent_buffer *eb,
                                  struct btrfs_timespec *ts)
@@ -533,12 +614,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
                if (ret < 0) \
                        goto tlv_put_failure; \
        } while (0)
-#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \
-       do { \
-               ret = tlv_put_timespec(sctx, attrtype, ts); \
-               if (ret < 0) \
-                       goto tlv_put_failure; \
-       } while (0)
 #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
        do { \
                ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
@@ -586,7 +661,7 @@ static int send_cmd(struct send_ctx *sctx)
        hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
        hdr->crc = 0;
 
-       crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+       crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
        hdr->crc = cpu_to_le32(crc);
 
        ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -881,9 +956,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
        struct btrfs_dir_item *di;
        struct btrfs_key di_key;
        char *buf = NULL;
-       char *buf2 = NULL;
-       int buf_len;
-       int buf_virtual = 0;
+       const int buf_len = PATH_MAX;
        u32 name_len;
        u32 data_len;
        u32 cur;
@@ -893,7 +966,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
        int num;
        u8 type;
 
-       buf_len = PAGE_SIZE;
        buf = kmalloc(buf_len, GFP_NOFS);
        if (!buf) {
                ret = -ENOMEM;
@@ -915,30 +987,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
                type = btrfs_dir_type(eb, di);
                btrfs_dir_item_key_to_cpu(eb, di, &di_key);
 
+               /*
+                * Combined name and data length exceeds PATH_MAX
+                */
                if (name_len + data_len > buf_len) {
-                       buf_len = PAGE_ALIGN(name_len + data_len);
-                       if (buf_virtual) {
-                               buf2 = vmalloc(buf_len);
-                               if (!buf2) {
-                                       ret = -ENOMEM;
-                                       goto out;
-                               }
-                               vfree(buf);
-                       } else {
-                               buf2 = krealloc(buf, buf_len, GFP_NOFS);
-                               if (!buf2) {
-                                       buf2 = vmalloc(buf_len);
-                                       if (!buf2) {
-                                               ret = -ENOMEM;
-                                               goto out;
-                                       }
-                                       kfree(buf);
-                                       buf_virtual = 1;
-                               }
-                       }
-
-                       buf = buf2;
-                       buf2 = NULL;
+                       ret = -ENAMETOOLONG;
+                       goto out;
                }
 
                read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -961,10 +1015,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
        }
 
 out:
-       if (buf_virtual)
-               vfree(buf);
-       else
-               kfree(buf);
+       kfree(buf);
        return ret;
 }
 
@@ -1258,8 +1309,6 @@ static int find_extent_clone(struct send_ctx *sctx,
                extent_item_pos = logical - found_key.objectid;
        else
                extent_item_pos = 0;
-
-       extent_item_pos = logical - found_key.objectid;
        ret = iterate_extent_inodes(sctx->send_root->fs_info,
                                        found_key.objectid, extent_item_pos, 1,
                                        __iterate_backrefs, backref_ctx);
@@ -1270,7 +1319,7 @@ static int find_extent_clone(struct send_ctx *sctx,
        if (!backref_ctx->found_itself) {
                /* found a bug in backref code? */
                ret = -EIO;
-               printk(KERN_ERR "btrfs: ERROR did not find backref in "
+               btrfs_err(sctx->send_root->fs_info, "did not find backref in "
                                "send_root. inode=%llu, offset=%llu, "
                                "disk_byte=%llu found extent=%llu\n",
                                ino, data_offset, disk_byte, found_key.objectid);
@@ -1298,6 +1347,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
        }
 
        if (cur_clone_root) {
+               if (compressed != BTRFS_COMPRESS_NONE) {
+                       /*
+                        * Offsets given by iterate_extent_inodes() are relative
+                        * to the start of the extent, we need to add logical
+                        * to the start of the extent, so we need to add the
+                        * logical offset from the file extent item.
+                        */
+                       cur_clone_root->offset += btrfs_file_extent_offset(eb,
+                                                                          fi);
+               }
                *found = cur_clone_root;
                ret = 0;
        } else {
@@ -1343,7 +1402,7 @@ static int read_symlink(struct btrfs_root *root,
        BUG_ON(compression);
 
        off = btrfs_file_extent_inline_start(ei);
-       len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+       len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
 
        ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
 
@@ -1372,13 +1431,9 @@ static int gen_unique_name(struct send_ctx *sctx,
                return -ENOMEM;
 
        while (1) {
-               len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
+               len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
                                ino, gen, idx);
-               if (len >= sizeof(tmp)) {
-                       /* should really not happen */
-                       ret = -EOVERFLOW;
-                       goto out;
-               }
+               ASSERT(len < sizeof(tmp));
 
                di = btrfs_lookup_dir_item(NULL, sctx->send_root,
                                path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1854,13 +1909,20 @@ static void name_cache_delete(struct send_ctx *sctx,
 
        nce_head = radix_tree_lookup(&sctx->name_cache,
                        (unsigned long)nce->ino);
-       BUG_ON(!nce_head);
+       if (!nce_head) {
+               btrfs_err(sctx->send_root->fs_info,
+             "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+                       nce->ino, sctx->name_cache_size);
+       }
 
        list_del(&nce->radix_list);
        list_del(&nce->list);
        sctx->name_cache_size--;
 
-       if (list_empty(nce_head)) {
+       /*
+        * We may not get to the final release of nce_head if the lookup fails
+        */
+       if (nce_head && list_empty(nce_head)) {
                radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
                kfree(nce_head);
        }
@@ -2094,12 +2156,27 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
        while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
                fs_path_reset(name);
 
-               ret = __get_cur_name_and_parent(sctx, ino, gen,
-                               &parent_inode, &parent_gen, name);
+               if (is_waiting_for_rm(sctx, ino)) {
+                       ret = gen_unique_name(sctx, ino, gen, name);
+                       if (ret < 0)
+                               goto out;
+                       ret = fs_path_add_path(dest, name);
+                       break;
+               }
+
+               if (is_waiting_for_move(sctx, ino)) {
+                       ret = get_first_ref(sctx->parent_root, ino,
+                                           &parent_inode, &parent_gen, name);
+               } else {
+                       ret = __get_cur_name_and_parent(sctx, ino, gen,
+                                                       &parent_inode,
+                                                       &parent_gen, name);
+                       if (ret)
+                               stop = 1;
+               }
+
                if (ret < 0)
                        goto out;
-               if (ret)
-                       stop = 1;
 
                ret = fs_path_add_path(dest, name);
                if (ret < 0)
@@ -2131,7 +2208,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
        char *name = NULL;
        int namelen;
 
-       path = alloc_path_for_send();
+       path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
@@ -2180,12 +2257,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
        TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
                        sctx->send_root->root_item.uuid);
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
-                       sctx->send_root->root_item.ctransid);
+                   le64_to_cpu(sctx->send_root->root_item.ctransid));
        if (parent_root) {
                TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
                                sctx->parent_root->root_item.uuid);
                TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
-                               sctx->parent_root->root_item.ctransid);
+                           le64_to_cpu(sctx->parent_root->root_item.ctransid));
        }
 
        ret = send_cmd(sctx);
@@ -2363,10 +2440,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
        if (!p)
                return -ENOMEM;
 
-       ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
-                       NULL, &rdev);
-       if (ret < 0)
-               goto out;
+       if (ino != sctx->cur_ino) {
+               ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+                                    NULL, NULL, &rdev);
+               if (ret < 0)
+                       goto out;
+       } else {
+               gen = sctx->cur_inode_gen;
+               mode = sctx->cur_inode_mode;
+               rdev = sctx->cur_inode_rdev;
+       }
 
        if (S_ISREG(mode)) {
                cmd = BTRFS_SEND_C_MKFILE;
@@ -2446,17 +2529,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
        key.objectid = dir;
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = 0;
+       ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
        while (1) {
-               ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-                               1, 0);
-               if (ret < 0)
-                       goto out;
-               if (!ret) {
-                       eb = path->nodes[0];
-                       slot = path->slots[0];
-                       btrfs_item_key_to_cpu(eb, &found_key, slot);
+               eb = path->nodes[0];
+               slot = path->slots[0];
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(sctx->send_root, path);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       continue;
                }
-               if (ret || found_key.objectid != key.objectid ||
+
+               btrfs_item_key_to_cpu(eb, &found_key, slot);
+               if (found_key.objectid != key.objectid ||
                    found_key.type != key.type) {
                        ret = 0;
                        goto out;
@@ -2471,8 +2563,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
                        goto out;
                }
 
-               key.offset = found_key.offset + 1;
-               btrfs_release_path(path);
+               path->slots[0]++;
        }
 
 out:
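
did_create_dir() above, and can_rmdir(), process_all_refs() and
process_all_new_xattrs() later in the patch, are all converted from re-running
btrfs_search_slot_for_read() for every item to a single btrfs_search_slot()
followed by walking slots and only calling btrfs_next_leaf() when a leaf is
exhausted. A minimal userspace model of that iteration shape, not part of the
patch; leaves are plain arrays here.

#include <stdio.h>

#define NR_LEAVES 2
#define NR_ITEMS  3

/* Two "leaves" of sorted keys; 99 stands for a key outside our range. */
static int leaves[NR_LEAVES][NR_ITEMS] = {
        { 10, 11, 12 },
        { 13, 14, 99 },
};

int main(void)
{
        int leaf = 0, slot = 0;

        while (1) {
                if (slot >= NR_ITEMS) {
                        /* Leaf exhausted: btrfs_next_leaf() in the patch. */
                        if (++leaf >= NR_LEAVES)
                                break;          /* ret > 0: no more leaves */
                        slot = 0;
                        continue;
                }
                if (leaves[leaf][slot] >= 99)
                        break;                  /* key no longer matches */
                printf("process key %d\n", leaves[leaf][slot]);
                slot++;                         /* path->slots[0]++ */
        }
        return 0;
}
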
@@ -2524,7 +2615,7 @@ struct recorded_ref {
  * everything mixed. So we first record all refs and later process them.
  * This function is a helper to record one ref.
  */
-static int record_ref(struct list_head *head, u64 dir,
+static int __record_ref(struct list_head *head, u64 dir,
                      u64 dir_gen, struct fs_path *path)
 {
        struct recorded_ref *ref;
@@ -2610,12 +2701,78 @@ out:
        return ret;
 }
 
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+       struct rb_node **p = &sctx->orphan_dirs.rb_node;
+       struct rb_node *parent = NULL;
+       struct orphan_dir_info *entry, *odi;
+
+       odi = kmalloc(sizeof(*odi), GFP_NOFS);
+       if (!odi)
+               return ERR_PTR(-ENOMEM);
+       odi->ino = dir_ino;
+       odi->gen = 0;
+
+       while (*p) {
+               parent = *p;
+               entry = rb_entry(parent, struct orphan_dir_info, node);
+               if (dir_ino < entry->ino) {
+                       p = &(*p)->rb_left;
+               } else if (dir_ino > entry->ino) {
+                       p = &(*p)->rb_right;
+               } else {
+                       kfree(odi);
+                       return entry;
+               }
+       }
+
+       rb_link_node(&odi->node, parent, p);
+       rb_insert_color(&odi->node, &sctx->orphan_dirs);
+       return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+       struct rb_node *n = sctx->orphan_dirs.rb_node;
+       struct orphan_dir_info *entry;
+
+       while (n) {
+               entry = rb_entry(n, struct orphan_dir_info, node);
+               if (dir_ino < entry->ino)
+                       n = n->rb_left;
+               else if (dir_ino > entry->ino)
+                       n = n->rb_right;
+               else
+                       return entry;
+       }
+       return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+       struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+       return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+                                struct orphan_dir_info *odi)
+{
+       if (!odi)
+               return;
+       rb_erase(&odi->node, &sctx->orphan_dirs);
+       kfree(odi);
+}
+
 /*
  * Returns 1 if a directory can be removed at this point in time.
  * We check this by iterating all dir items and checking if the inode behind
  * the dir item was already processed.
  */
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+                    u64 send_progress)
 {
        int ret = 0;
        struct btrfs_root *root = sctx->parent_root;
@@ -2638,31 +2795,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
        key.objectid = dir;
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
        while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (!ret) {
-                       btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                       path->slots[0]);
+               struct waiting_dir_move *dm;
+
+               if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
                }
-               if (ret || found_key.objectid != key.objectid ||
-                   found_key.type != key.type) {
+               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                     path->slots[0]);
+               if (found_key.objectid != key.objectid ||
+                   found_key.type != key.type)
                        break;
-               }
 
                di = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                struct btrfs_dir_item);
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
 
+               dm = get_waiting_dir_move(sctx, loc.objectid);
+               if (dm) {
+                       struct orphan_dir_info *odi;
+
+                       odi = add_orphan_dir_info(sctx, dir);
+                       if (IS_ERR(odi)) {
+                               ret = PTR_ERR(odi);
+                               goto out;
+                       }
+                       odi->gen = dir_gen;
+                       dm->rmdir_ino = dir;
+                       ret = 0;
+                       goto out;
+               }
+
                if (loc.objectid > send_progress) {
                        ret = 0;
                        goto out;
                }
 
-               btrfs_release_path(path);
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
 
        ret = 1;
@@ -2672,59 +2850,457 @@ out:
        return ret;
 }
 
-/*
- * This does all the move/link/unlink/rmdir magic.
- */
-static int process_recorded_refs(struct send_ctx *sctx)
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
 {
-       int ret = 0;
-       struct recorded_ref *cur;
-       struct recorded_ref *cur2;
-       struct list_head check_dirs;
-       struct fs_path *valid_path = NULL;
-       u64 ow_inode = 0;
-       u64 ow_gen;
-       int did_overwrite = 0;
-       int is_orphan = 0;
+       struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
 
-verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
+       return entry != NULL;
+}
 
-       /*
-        * This should never happen as the root dir always has the same ref
-        * which is always '..'
-        */
-       BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
-       INIT_LIST_HEAD(&check_dirs);
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+       struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
+       struct rb_node *parent = NULL;
+       struct waiting_dir_move *entry, *dm;
 
-       valid_path = fs_path_alloc();
-       if (!valid_path) {
-               ret = -ENOMEM;
-               goto out;
+       dm = kmalloc(sizeof(*dm), GFP_NOFS);
+       if (!dm)
+               return -ENOMEM;
+       dm->ino = ino;
+       dm->rmdir_ino = 0;
+
+       while (*p) {
+               parent = *p;
+               entry = rb_entry(parent, struct waiting_dir_move, node);
+               if (ino < entry->ino) {
+                       p = &(*p)->rb_left;
+               } else if (ino > entry->ino) {
+                       p = &(*p)->rb_right;
+               } else {
+                       kfree(dm);
+                       return -EEXIST;
+               }
        }
 
-       /*
-        * First, check if the first ref of the current inode was overwritten
-        * before. If yes, we know that the current inode was already orphanized
-        * and thus use the orphan name. If not, we can use get_cur_path to
-        * get the path of the first ref as it would like while receiving at
-        * this point in time.
-        * New inodes are always orphan at the beginning, so force to use the
-        * orphan name in this case.
-        * The first ref is stored in valid_path and will be updated if it
-        * gets moved around.
-        */
-       if (!sctx->cur_inode_new) {
-               ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
-                               sctx->cur_inode_gen);
-               if (ret < 0)
-                       goto out;
-               if (ret)
-                       did_overwrite = 1;
+       rb_link_node(&dm->node, parent, p);
+       rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
+       return 0;
+}
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+       struct rb_node *n = sctx->waiting_dir_moves.rb_node;
+       struct waiting_dir_move *entry;
+
+       while (n) {
+               entry = rb_entry(n, struct waiting_dir_move, node);
+               if (ino < entry->ino)
+                       n = n->rb_left;
+               else if (ino > entry->ino)
+                       n = n->rb_right;
+               else
+                       return entry;
        }
-       if (sctx->cur_inode_new || did_overwrite) {
-               ret = gen_unique_name(sctx, sctx->cur_ino,
-                               sctx->cur_inode_gen, valid_path);
-               if (ret < 0)
+       return NULL;
+}
+
+static void free_waiting_dir_move(struct send_ctx *sctx,
+                                 struct waiting_dir_move *dm)
+{
+       if (!dm)
+               return;
+       rb_erase(&dm->node, &sctx->waiting_dir_moves);
+       kfree(dm);
+}
+
+static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
+{
+       struct rb_node **p = &sctx->pending_dir_moves.rb_node;
+       struct rb_node *parent = NULL;
+       struct pending_dir_move *entry, *pm;
+       struct recorded_ref *cur;
+       int exists = 0;
+       int ret;
+
+       pm = kmalloc(sizeof(*pm), GFP_NOFS);
+       if (!pm)
+               return -ENOMEM;
+       pm->parent_ino = parent_ino;
+       pm->ino = sctx->cur_ino;
+       pm->gen = sctx->cur_inode_gen;
+       INIT_LIST_HEAD(&pm->list);
+       INIT_LIST_HEAD(&pm->update_refs);
+       RB_CLEAR_NODE(&pm->node);
+
+       while (*p) {
+               parent = *p;
+               entry = rb_entry(parent, struct pending_dir_move, node);
+               if (parent_ino < entry->parent_ino) {
+                       p = &(*p)->rb_left;
+               } else if (parent_ino > entry->parent_ino) {
+                       p = &(*p)->rb_right;
+               } else {
+                       exists = 1;
+                       break;
+               }
+       }
+
+       list_for_each_entry(cur, &sctx->deleted_refs, list) {
+               ret = dup_ref(cur, &pm->update_refs);
+               if (ret < 0)
+                       goto out;
+       }
+       list_for_each_entry(cur, &sctx->new_refs, list) {
+               ret = dup_ref(cur, &pm->update_refs);
+               if (ret < 0)
+                       goto out;
+       }
+
+       ret = add_waiting_dir_move(sctx, pm->ino);
+       if (ret)
+               goto out;
+
+       if (exists) {
+               list_add_tail(&pm->list, &entry->list);
+       } else {
+               rb_link_node(&pm->node, parent, p);
+               rb_insert_color(&pm->node, &sctx->pending_dir_moves);
+       }
+       ret = 0;
+out:
+       if (ret) {
+               __free_recorded_refs(&pm->update_refs);
+               kfree(pm);
+       }
+       return ret;
+}
+
+static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
+                                                     u64 parent_ino)
+{
+       struct rb_node *n = sctx->pending_dir_moves.rb_node;
+       struct pending_dir_move *entry;
+
+       while (n) {
+               entry = rb_entry(n, struct pending_dir_move, node);
+               if (parent_ino < entry->parent_ino)
+                       n = n->rb_left;
+               else if (parent_ino > entry->parent_ino)
+                       n = n->rb_right;
+               else
+                       return entry;
+       }
+       return NULL;
+}
+
+static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
+{
+       struct fs_path *from_path = NULL;
+       struct fs_path *to_path = NULL;
+       struct fs_path *name = NULL;
+       u64 orig_progress = sctx->send_progress;
+       struct recorded_ref *cur;
+       u64 parent_ino, parent_gen;
+       struct waiting_dir_move *dm = NULL;
+       u64 rmdir_ino = 0;
+       int ret;
+
+       name = fs_path_alloc();
+       from_path = fs_path_alloc();
+       if (!name || !from_path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       dm = get_waiting_dir_move(sctx, pm->ino);
+       ASSERT(dm);
+       rmdir_ino = dm->rmdir_ino;
+       free_waiting_dir_move(sctx, dm);
+
+       ret = get_first_ref(sctx->parent_root, pm->ino,
+                           &parent_ino, &parent_gen, name);
+       if (ret < 0)
+               goto out;
+
+       if (parent_ino == sctx->cur_ino) {
+               /* child only renamed, not moved */
+               ASSERT(parent_gen == sctx->cur_inode_gen);
+               ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+                                  from_path);
+               if (ret < 0)
+                       goto out;
+               ret = fs_path_add_path(from_path, name);
+               if (ret < 0)
+                       goto out;
+       } else {
+               /* child moved and maybe renamed too */
+               sctx->send_progress = pm->ino;
+               ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+               if (ret < 0)
+                       goto out;
+       }
+
+       fs_path_free(name);
+       name = NULL;
+
+       to_path = fs_path_alloc();
+       if (!to_path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       sctx->send_progress = sctx->cur_ino + 1;
+       ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
+       if (ret < 0)
+               goto out;
+
+       ret = send_rename(sctx, from_path, to_path);
+       if (ret < 0)
+               goto out;
+
+       if (rmdir_ino) {
+               struct orphan_dir_info *odi;
+
+               odi = get_orphan_dir_info(sctx, rmdir_ino);
+               if (!odi) {
+                       /* already deleted */
+                       goto finish;
+               }
+               ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+               if (ret < 0)
+                       goto out;
+               if (!ret)
+                       goto finish;
+
+               name = fs_path_alloc();
+               if (!name) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+               if (ret < 0)
+                       goto out;
+               ret = send_rmdir(sctx, name);
+               if (ret < 0)
+                       goto out;
+               free_orphan_dir_info(sctx, odi);
+       }
+
+finish:
+       ret = send_utimes(sctx, pm->ino, pm->gen);
+       if (ret < 0)
+               goto out;
+
+       /*
+        * After rename/move, need to update the utimes of both new parent(s)
+        * and old parent(s).
+        */
+       list_for_each_entry(cur, &pm->update_refs, list) {
+               if (cur->dir == rmdir_ino)
+                       continue;
+               ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+               if (ret < 0)
+                       goto out;
+       }
+
+out:
+       fs_path_free(name);
+       fs_path_free(from_path);
+       fs_path_free(to_path);
+       sctx->send_progress = orig_progress;
+
+       return ret;
+}
+
+static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
+{
+       if (!list_empty(&m->list))
+               list_del(&m->list);
+       if (!RB_EMPTY_NODE(&m->node))
+               rb_erase(&m->node, &sctx->pending_dir_moves);
+       __free_recorded_refs(&m->update_refs);
+       kfree(m);
+}
+
+static void tail_append_pending_moves(struct pending_dir_move *moves,
+                                     struct list_head *stack)
+{
+       if (list_empty(&moves->list)) {
+               list_add_tail(&moves->list, stack);
+       } else {
+               LIST_HEAD(list);
+               list_splice_init(&moves->list, &list);
+               list_add_tail(&moves->list, stack);
+               list_splice_tail(&list, stack);
+       }
+}
+
+static int apply_children_dir_moves(struct send_ctx *sctx)
+{
+       struct pending_dir_move *pm;
+       struct list_head stack;
+       u64 parent_ino = sctx->cur_ino;
+       int ret = 0;
+
+       pm = get_pending_dir_moves(sctx, parent_ino);
+       if (!pm)
+               return 0;
+
+       INIT_LIST_HEAD(&stack);
+       tail_append_pending_moves(pm, &stack);
+
+       while (!list_empty(&stack)) {
+               pm = list_first_entry(&stack, struct pending_dir_move, list);
+               parent_ino = pm->ino;
+               ret = apply_dir_move(sctx, pm);
+               free_pending_move(sctx, pm);
+               if (ret)
+                       goto out;
+               pm = get_pending_dir_moves(sctx, parent_ino);
+               if (pm)
+                       tail_append_pending_moves(pm, &stack);
+       }
+       return 0;
+
+out:
+       while (!list_empty(&stack)) {
+               pm = list_first_entry(&stack, struct pending_dir_move, list);
+               free_pending_move(sctx, pm);
+       }
+       return ret;
+}
+
+static int wait_for_parent_move(struct send_ctx *sctx,
+                               struct recorded_ref *parent_ref)
+{
+       int ret;
+       u64 ino = parent_ref->dir;
+       u64 parent_ino_before, parent_ino_after;
+       u64 new_gen, old_gen;
+       struct fs_path *path_before = NULL;
+       struct fs_path *path_after = NULL;
+       int len1, len2;
+
+       if (parent_ref->dir <= sctx->cur_ino)
+               return 0;
+
+       if (is_waiting_for_move(sctx, ino))
+               return 1;
+
+       ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
+                            NULL, NULL, NULL, NULL);
+       if (ret == -ENOENT)
+               return 0;
+       else if (ret < 0)
+               return ret;
+
+       ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen,
+                            NULL, NULL, NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       if (new_gen != old_gen)
+               return 0;
+
+       path_before = fs_path_alloc();
+       if (!path_before)
+               return -ENOMEM;
+
+       ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+                           NULL, path_before);
+       if (ret == -ENOENT) {
+               ret = 0;
+               goto out;
+       } else if (ret < 0) {
+               goto out;
+       }
+
+       path_after = fs_path_alloc();
+       if (!path_after) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+                           NULL, path_after);
+       if (ret == -ENOENT) {
+               ret = 0;
+               goto out;
+       } else if (ret < 0) {
+               goto out;
+       }
+
+       len1 = fs_path_len(path_before);
+       len2 = fs_path_len(path_after);
+       if (parent_ino_before != parent_ino_after || len1 != len2 ||
+            memcmp(path_before->start, path_after->start, len1)) {
+               ret = 1;
+               goto out;
+       }
+       ret = 0;
+
+out:
+       fs_path_free(path_before);
+       fs_path_free(path_after);
+
+       return ret;
+}
+
+/*
+ * This does all the move/link/unlink/rmdir magic.
+ */
+static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+{
+       int ret = 0;
+       struct recorded_ref *cur;
+       struct recorded_ref *cur2;
+       struct list_head check_dirs;
+       struct fs_path *valid_path = NULL;
+       u64 ow_inode = 0;
+       u64 ow_gen;
+       int did_overwrite = 0;
+       int is_orphan = 0;
+       u64 last_dir_ino_rm = 0;
+
+verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
+
+       /*
+        * This should never happen as the root dir always has the same ref
+        * which is always '..'
+        */
+       BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+       INIT_LIST_HEAD(&check_dirs);
+
+       valid_path = fs_path_alloc();
+       if (!valid_path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * First, check if the first ref of the current inode was overwritten
+        * before. If yes, we know that the current inode was already orphanized
+        * and thus use the orphan name. If not, we can use get_cur_path to
+        * get the path of the first ref as it would look like while
+        * receiving at this point in time.
+        * New inodes are always orphan at the beginning, so force to use the
+        * orphan name in this case.
+        * The first ref is stored in valid_path and will be updated if it
+        * gets moved around.
+        */
+       if (!sctx->cur_inode_new) {
+               ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
+                               sctx->cur_inode_gen);
+               if (ret < 0)
+                       goto out;
+               if (ret)
+                       did_overwrite = 1;
+       }
+       if (sctx->cur_inode_new || did_overwrite) {
+               ret = gen_unique_name(sctx, sctx->cur_ino,
+                               sctx->cur_inode_gen, valid_path);
+               if (ret < 0)
                        goto out;
                is_orphan = 1;
        } else {
@@ -2824,11 +3400,20 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                 * dirs, we always have one new and one deleted
                                 * ref. The deleted ref is ignored later.
                                 */
-                               ret = send_rename(sctx, valid_path,
-                                               cur->full_path);
+                               ret = wait_for_parent_move(sctx, cur);
                                if (ret < 0)
                                        goto out;
-                               ret = fs_path_copy(valid_path, cur->full_path);
+                               if (ret) {
+                                       ret = add_pending_dir_move(sctx,
+                                                                  cur->dir);
+                                       *pending_move = 1;
+                               } else {
+                                       ret = send_rename(sctx, valid_path,
+                                                         cur->full_path);
+                                       if (!ret)
+                                               ret = fs_path_copy(valid_path,
+                                                              cur->full_path);
+                               }
                                if (ret < 0)
                                        goto out;
                        } else {
@@ -2850,7 +3435,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                 * later, we do this check again and rmdir it then if possible.
                 * See the use of check_dirs for more details.
                 */
-               ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
+               ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+                               sctx->cur_ino);
                if (ret < 0)
                        goto out;
                if (ret) {
@@ -2941,8 +3527,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                        ret = send_utimes(sctx, cur->dir, cur->dir_gen);
                        if (ret < 0)
                                goto out;
-               } else if (ret == inode_state_did_delete) {
-                       ret = can_rmdir(sctx, cur->dir, sctx->cur_ino);
+               } else if (ret == inode_state_did_delete &&
+                          cur->dir != last_dir_ino_rm) {
+                       ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+                                       sctx->cur_ino);
                        if (ret < 0)
                                goto out;
                        if (ret) {
@@ -2953,6 +3541,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                ret = send_rmdir(sctx, valid_path);
                                if (ret < 0)
                                        goto out;
+                               last_dir_ino_rm = cur->dir;
                        }
                }
        }
@@ -2966,9 +3555,8 @@ out:
        return ret;
 }
 
-static int __record_new_ref(int num, u64 dir, int index,
-                           struct fs_path *name,
-                           void *ctx)
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+                     struct fs_path *name, void *ctx, struct list_head *refs)
 {
        int ret = 0;
        struct send_ctx *sctx = ctx;
@@ -2979,7 +3567,7 @@ static int __record_new_ref(int num, u64 dir, int index,
        if (!p)
                return -ENOMEM;
 
-       ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
+       ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
                        NULL, NULL);
        if (ret < 0)
                goto out;
@@ -2991,7 +3579,7 @@ static int __record_new_ref(int num, u64 dir, int index,
        if (ret < 0)
                goto out;
 
-       ret = record_ref(&sctx->new_refs, dir, gen, p);
+       ret = __record_ref(refs, dir, gen, p);
 
 out:
        if (ret)
@@ -2999,37 +3587,23 @@ out:
        return ret;
 }
 
+static int __record_new_ref(int num, u64 dir, int index,
+                           struct fs_path *name,
+                           void *ctx)
+{
+       struct send_ctx *sctx = ctx;
+       return record_ref(sctx->send_root, num, dir, index, name,
+                         ctx, &sctx->new_refs);
+}
+
+
 static int __record_deleted_ref(int num, u64 dir, int index,
                                struct fs_path *name,
                                void *ctx)
 {
-       int ret = 0;
        struct send_ctx *sctx = ctx;
-       struct fs_path *p;
-       u64 gen;
-
-       p = fs_path_alloc();
-       if (!p)
-               return -ENOMEM;
-
-       ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
-                       NULL, NULL);
-       if (ret < 0)
-               goto out;
-
-       ret = get_cur_path(sctx, dir, gen, p);
-       if (ret < 0)
-               goto out;
-       ret = fs_path_add_path(p, name);
-       if (ret < 0)
-               goto out;
-
-       ret = record_ref(&sctx->deleted_refs, dir, gen, p);
-
-out:
-       if (ret)
-               fs_path_free(p);
-       return ret;
+       return record_ref(sctx->parent_root, num, dir, index, name,
+                         ctx, &sctx->deleted_refs);
 }
 
 static int record_new_ref(struct send_ctx *sctx)
@@ -3197,6 +3771,7 @@ static int process_all_refs(struct send_ctx *sctx,
        struct extent_buffer *eb;
        int slot;
        iterate_inode_ref_t cb;
+       int pending_move = 0;
 
        path = alloc_path_for_send();
        if (!path)
@@ -3209,21 +3784,31 @@ static int process_all_refs(struct send_ctx *sctx,
                root = sctx->parent_root;
                cb = __record_deleted_ref;
        } else {
-               BUG();
+               btrfs_err(sctx->send_root->fs_info,
+                               "Wrong command %d in process_all_refs", cmd);
+               ret = -EINVAL;
+               goto out;
        }
 
        key.objectid = sctx->cmp_key->objectid;
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = 0;
-       while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (ret)
-                       break;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
+       while (1) {
                eb = path->nodes[0];
                slot = path->slots[0];
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
                btrfs_item_key_to_cpu(eb, &found_key, slot);
 
                if (found_key.objectid != key.objectid ||
@@ -3232,15 +3817,16 @@ static int process_all_refs(struct send_ctx *sctx,
                        break;
 
                ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
-               btrfs_release_path(path);
                if (ret < 0)
                        goto out;
 
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
        btrfs_release_path(path);
 
-       ret = process_recorded_refs(sctx);
+       ret = process_recorded_refs(sctx, &pending_move);
+       /* Only applicable to an incremental send. */
+       ASSERT(pending_move == 0);
 
 out:
        btrfs_free_path(path);
@@ -3515,19 +4101,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
        key.objectid = sctx->cmp_key->objectid;
        key.type = BTRFS_XATTR_ITEM_KEY;
        key.offset = 0;
-       while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (ret) {
-                       ret = 0;
-                       goto out;
-               }
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
+       while (1) {
                eb = path->nodes[0];
                slot = path->slots[0];
-               btrfs_item_key_to_cpu(eb, &found_key, slot);
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       continue;
+               }
 
+               btrfs_item_key_to_cpu(eb, &found_key, slot);
                if (found_key.objectid != key.objectid ||
                    found_key.type != key.type) {
                        ret = 0;
@@ -3539,8 +4131,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
                if (ret < 0)
                        goto out;
 
-               btrfs_release_path(path);
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
 
 out:
@@ -3706,7 +4297,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
        TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
                        clone_root->root->root_item.uuid);
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
-                       clone_root->root->root_item.ctransid);
+                   le64_to_cpu(clone_root->root->root_item.ctransid));
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
                        clone_root->offset);
@@ -3752,6 +4343,39 @@ out:
        return ret;
 }
 
+static int send_hole(struct send_ctx *sctx, u64 end)
+{
+       struct fs_path *p = NULL;
+       u64 offset = sctx->cur_inode_last_extent;
+       u64 len;
+       int ret = 0;
+
+       p = fs_path_alloc();
+       if (!p)
+               return -ENOMEM;
+       memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
+       while (offset < end) {
+               len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+
+               ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+               if (ret < 0)
+                       break;
+               ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+               if (ret < 0)
+                       break;
+               TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+               TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+               TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+               ret = send_cmd(sctx);
+               if (ret < 0)
+                       break;
+               offset += len;
+       }
+tlv_put_failure:
+       fs_path_free(p);
+       return ret;
+}
+
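
send_hole() fills the range [cur_inode_last_extent, end) with ordinary WRITE commands whose payload is the zeroed read_buf, at most BTRFS_SEND_READ_SIZE bytes per command. A minimal userspace sketch of the same chunking, with a hypothetical emit_write() standing in for the begin_cmd()/TLV_PUT*/send_cmd() sequence:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define READ_SIZE (48 * 1024)   /* stand-in for BTRFS_SEND_READ_SIZE */

    static char zero_buf[READ_SIZE];

    /* Hypothetical stand-in for one BTRFS_SEND_C_WRITE command. */
    static int emit_write(uint64_t offset, uint64_t len, const char *data)
    {
            (void)data;
            printf("WRITE offset=%llu len=%llu\n",
                   (unsigned long long)offset, (unsigned long long)len);
            return 0;
    }

    /* Cover the hole [offset, end) with zero-filled writes. */
    static int send_hole_sketch(uint64_t offset, uint64_t end)
    {
            memset(zero_buf, 0, sizeof(zero_buf));
            while (offset < end) {
                    uint64_t len = end - offset;

                    if (len > READ_SIZE)
                            len = READ_SIZE;   /* min_t(u64, ...) */
                    if (emit_write(offset, len, zero_buf))
                            return -1;
                    offset += len;
            }
            return 0;
    }

    int main(void)
    {
            return send_hole_sketch(100 * 1024, 300 * 1024);
    }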
 static int send_write_or_clone(struct send_ctx *sctx,
                               struct btrfs_path *path,
                               struct btrfs_key *key,
@@ -3764,12 +4388,14 @@ static int send_write_or_clone(struct send_ctx *sctx,
        u64 len;
        u32 l;
        u8 type;
+       u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
 
        ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
                        struct btrfs_file_extent_item);
        type = btrfs_file_extent_type(path->nodes[0], ei);
        if (type == BTRFS_FILE_EXTENT_INLINE) {
-               len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+               len = btrfs_file_extent_inline_len(path->nodes[0],
+                                                  path->slots[0], ei);
                /*
                 * it is possible the inline item won't cover the whole page,
                 * but there may be items after this page.  Make
@@ -3787,7 +4413,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
                goto out;
        }
 
-       if (clone_root) {
+       if (clone_root && IS_ALIGNED(offset + len, bs)) {
                ret = send_clone(sctx, offset, len, clone_root);
        } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
                ret = send_update_extent(sctx, offset, len);
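
The new IS_ALIGNED(offset + len, bs) check only issues a clone when the end of the range sits on a block boundary, and falls back to a plain write otherwise. A small sketch of that decision; IS_ALIGNED_P2 is a hypothetical power-of-two alignment test with the same meaning as the kernel's IS_ALIGNED():

    #include <stdint.h>
    #include <stdio.h>

    /* Power-of-two alignment test, same idea as the kernel's IS_ALIGNED(). */
    #define IS_ALIGNED_P2(x, a) (((x) & ((a) - 1)) == 0)

    int main(void)
    {
            uint64_t bs = 4096;                 /* sb->s_blocksize */
            uint64_t offset = 0, len = 6000;    /* extent not ending on a block */
            int have_clone_root = 1;

            if (have_clone_root && IS_ALIGNED_P2(offset + len, bs))
                    printf("clone  offset=%llu len=%llu\n",
                           (unsigned long long)offset, (unsigned long long)len);
            else
                    printf("write  offset=%llu len=%llu\n",
                           (unsigned long long)offset, (unsigned long long)len);
            return 0;
    }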
@@ -3979,6 +4605,101 @@ out:
        return ret;
 }
 
+static int get_last_extent(struct send_ctx *sctx, u64 offset)
+{
+       struct btrfs_path *path;
+       struct btrfs_root *root = sctx->send_root;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 extent_end;
+       u8 type;
+       int ret;
+
+       path = alloc_path_for_send();
+       if (!path)
+               return -ENOMEM;
+
+       sctx->cur_inode_last_extent = 0;
+
+       key.objectid = sctx->cur_ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = offset;
+       ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
+       if (ret < 0)
+               goto out;
+       ret = 0;
+       btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+       if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
+               goto out;
+
+       fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                           struct btrfs_file_extent_item);
+       type = btrfs_file_extent_type(path->nodes[0], fi);
+       if (type == BTRFS_FILE_EXTENT_INLINE) {
+               u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+                                                       path->slots[0], fi);
+               extent_end = ALIGN(key.offset + size,
+                                  sctx->send_root->sectorsize);
+       } else {
+               extent_end = key.offset +
+                       btrfs_file_extent_num_bytes(path->nodes[0], fi);
+       }
+       sctx->cur_inode_last_extent = extent_end;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
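
get_last_extent() records where the inode's last known extent ends: inline extents are rounded up to the sector size, regular ones end at key.offset plus num_bytes. A minimal sketch of that arithmetic with a hypothetical ALIGN_UP macro (same rounding as the kernel's ALIGN):

    #include <stdint.h>
    #include <stdio.h>

    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    enum extent_type { EXTENT_INLINE, EXTENT_REG };

    struct file_extent {
            enum extent_type type;
            uint64_t offset;   /* key.offset: logical file offset */
            uint64_t bytes;    /* inline length or num_bytes */
    };

    static uint64_t extent_end(const struct file_extent *fe, uint64_t sectorsize)
    {
            if (fe->type == EXTENT_INLINE)
                    return ALIGN_UP(fe->offset + fe->bytes, sectorsize);
            return fe->offset + fe->bytes;
    }

    int main(void)
    {
            struct file_extent inline_fe = { EXTENT_INLINE, 0, 1200 };
            struct file_extent reg_fe = { EXTENT_REG, 4096, 8192 };

            printf("inline end  = %llu\n",
                   (unsigned long long)extent_end(&inline_fe, 4096));  /* 4096 */
            printf("regular end = %llu\n",
                   (unsigned long long)extent_end(&reg_fe, 4096));     /* 12288 */
            return 0;
    }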
+static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
+                          struct btrfs_key *key)
+{
+       struct btrfs_file_extent_item *fi;
+       u64 extent_end;
+       u8 type;
+       int ret = 0;
+
+       if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
+               return 0;
+
+       if (sctx->cur_inode_last_extent == (u64)-1) {
+               ret = get_last_extent(sctx, key->offset - 1);
+               if (ret)
+                       return ret;
+       }
+
+       fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                           struct btrfs_file_extent_item);
+       type = btrfs_file_extent_type(path->nodes[0], fi);
+       if (type == BTRFS_FILE_EXTENT_INLINE) {
+               u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+                                                       path->slots[0], fi);
+               extent_end = ALIGN(key->offset + size,
+                                  sctx->send_root->sectorsize);
+       } else {
+               extent_end = key->offset +
+                       btrfs_file_extent_num_bytes(path->nodes[0], fi);
+       }
+
+       if (path->slots[0] == 0 &&
+           sctx->cur_inode_last_extent < key->offset) {
+               /*
+                * We might have skipped entire leaves that contained only
+                * file extent items for our current inode. These leaves have
+                * a generation number smaller (older) than the one in the
+                * current leaf and the leaf our last extent came from, and
+                * are located between those two leaves.
+                */
+               ret = get_last_extent(sctx, key->offset - 1);
+               if (ret)
+                       return ret;
+       }
+
+       if (sctx->cur_inode_last_extent < key->offset)
+               ret = send_hole(sctx, key->offset);
+       sctx->cur_inode_last_extent = extent_end;
+       return ret;
+}
+
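
maybe_send_hole() compares where the previous extent ended with where the current file extent item starts; any gap in between is a hole that has to be sent as zeros, and finish_inode_if_needed() below handles a trailing hole up to i_size the same way. A minimal sketch of that scan over a sorted extent list, with a hypothetical emit_hole():

    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint64_t offset, end; };   /* [offset, end), sorted */

    static void emit_hole(uint64_t start, uint64_t end)
    {
            printf("hole [%llu, %llu)\n",
                   (unsigned long long)start, (unsigned long long)end);
    }

    /* Walk one file's extents and report every gap, including a trailing
     * hole up to i_size (the finish_inode_if_needed() case). */
    static void find_holes(const struct extent *ext, int n, uint64_t i_size)
    {
            uint64_t last_end = 0;   /* like cur_inode_last_extent */
            int i;

            for (i = 0; i < n; i++) {
                    if (last_end < ext[i].offset)
                            emit_hole(last_end, ext[i].offset);
                    last_end = ext[i].end;
            }
            if (last_end < i_size)
                    emit_hole(last_end, i_size);
    }

    int main(void)
    {
            const struct extent ext[] = {
                    { 0, 4096 }, { 8192, 16384 },   /* hole at [4096, 8192) */
            };
            find_holes(ext, 2, 32768);              /* and at [16384, 32768) */
            return 0;
    }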
 static int process_extent(struct send_ctx *sctx,
                          struct btrfs_path *path,
                          struct btrfs_key *key)
@@ -3995,7 +4716,7 @@ static int process_extent(struct send_ctx *sctx,
                        goto out;
                if (ret) {
                        ret = 0;
-                       goto out;
+                       goto out_hole;
                }
        } else {
                struct btrfs_file_extent_item *ei;
@@ -4031,7 +4752,10 @@ static int process_extent(struct send_ctx *sctx,
                goto out;
 
        ret = send_write_or_clone(sctx, path, key, found_clone);
-
+       if (ret)
+               goto out;
+out_hole:
+       ret = maybe_send_hole(sctx, path, key);
 out:
        return ret;
 }
@@ -4054,17 +4778,25 @@ static int process_all_extents(struct send_ctx *sctx)
        key.objectid = sctx->cmp_key->objectid;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = 0;
-       while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (ret) {
-                       ret = 0;
-                       goto out;
-               }
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
+       while (1) {
                eb = path->nodes[0];
                slot = path->slots[0];
+
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       continue;
+               }
+
                btrfs_item_key_to_cpu(eb, &found_key, slot);
 
                if (found_key.objectid != key.objectid ||
@@ -4077,8 +4809,7 @@ static int process_all_extents(struct send_ctx *sctx)
                if (ret < 0)
                        goto out;
 
-               btrfs_release_path(path);
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
 
 out:
@@ -4086,7 +4817,9 @@ out:
        return ret;
 }
 
-static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+                                          int *pending_move,
+                                          int *refs_processed)
 {
        int ret = 0;
 
@@ -4098,17 +4831,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
        if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
                goto out;
 
-       ret = process_recorded_refs(sctx);
+       ret = process_recorded_refs(sctx, pending_move);
        if (ret < 0)
                goto out;
 
-       /*
-        * We have processed the refs and thus need to advance send_progress.
-        * Now, calls to get_cur_xxx will take the updated refs of the current
-        * inode into account.
-        */
-       sctx->send_progress = sctx->cur_ino + 1;
-
+       *refs_processed = 1;
 out:
        return ret;
 }
@@ -4124,11 +4851,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
        u64 right_gid;
        int need_chmod = 0;
        int need_chown = 0;
+       int pending_move = 0;
+       int refs_processed = 0;
 
-       ret = process_recorded_refs_if_needed(sctx, at_end);
+       ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
+                                             &refs_processed);
        if (ret < 0)
                goto out;
 
+       /*
+        * We have processed the refs and thus need to advance send_progress.
+        * Now, calls to get_cur_xxx will take the updated refs of the current
+        * inode into account.
+        *
+        * On the other hand, if our current inode is a directory and couldn't
+        * be moved/renamed because its parent was renamed/moved too and it has
+        * a higher inode number, we can only move/rename our current inode
+        * after we moved/renamed its parent. Therefore in this case operate on
+        * the old path (pre move/rename) of our current inode, and the
+        * move/rename will be performed later.
+        */
+       if (refs_processed && !pending_move)
+               sctx->send_progress = sctx->cur_ino + 1;
+
        if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
                goto out;
        if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
@@ -4157,6 +4902,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
        }
 
        if (S_ISREG(sctx->cur_inode_mode)) {
+               if (need_send_hole(sctx)) {
+                       if (sctx->cur_inode_last_extent == (u64)-1) {
+                               ret = get_last_extent(sctx, (u64)-1);
+                               if (ret)
+                                       goto out;
+                       }
+                       if (sctx->cur_inode_last_extent <
+                           sctx->cur_inode_size) {
+                               ret = send_hole(sctx, sctx->cur_inode_size);
+                               if (ret)
+                                       goto out;
+                       }
+               }
                ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
                                sctx->cur_inode_size);
                if (ret < 0)
@@ -4177,12 +4935,25 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
        }
 
        /*
-        * Need to send that every time, no matter if it actually changed
-        * between the two trees as we have done changes to the inode before.
+        * If other directory inodes depended on our current directory
+        * inode's move/rename, now do their move/rename operations.
         */
-       ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
-       if (ret < 0)
-               goto out;
+       if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
+               ret = apply_children_dir_moves(sctx);
+               if (ret)
+                       goto out;
+               /*
+                * Need to send that every time, no matter if it actually
+                * changed between the two trees as we have done changes to
+                * the inode before. If our inode is a directory and it's
+                * waiting to be moved/renamed, we will send its utimes when
+                * it's moved/renamed, therefore we don't need to do it here.
+                */
+               sctx->send_progress = sctx->cur_ino + 1;
+               ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+               if (ret < 0)
+                       goto out;
+       }
 
 out:
        return ret;
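
The comments above describe why a directory move may have to wait: if its new parent was also renamed and has a higher inode number, the child's rename is queued and only applied by apply_children_dir_moves() once the parent has been processed, and send_progress/utimes are deferred accordingly. A minimal userspace sketch of that queue-and-apply idea; pending_move and apply_children_moves here are hypothetical, not the send.c structures:

    #include <stdint.h>
    #include <stdio.h>

    struct pending_move {
            uint64_t ino;         /* directory waiting to be moved */
            uint64_t parent_ino;  /* it must wait for this parent */
            int done;
    };

    /* Apply every queued move that was waiting on @parent_ino. */
    static void apply_children_moves(struct pending_move *pm, int n,
                                     uint64_t parent_ino)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (!pm[i].done && pm[i].parent_ino == parent_ino) {
                            printf("move ino %llu (parent %llu finished)\n",
                                   (unsigned long long)pm[i].ino,
                                   (unsigned long long)parent_ino);
                            pm[i].done = 1;
                            /* the moved child may itself unblock others */
                            apply_children_moves(pm, n, pm[i].ino);
                    }
            }
    }

    int main(void)
    {
            /* Inode numbers are arbitrary: 259 waits for 260, 260 for 261. */
            struct pending_move pm[] = {
                    { 259, 260, 0 },
                    { 260, 261, 0 },
            };

            printf("finished processing ino 261\n");
            apply_children_moves(pm, 2, 261);
            return 0;
    }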
@@ -4200,6 +4971,7 @@ static int changed_inode(struct send_ctx *sctx,
 
        sctx->cur_ino = key->objectid;
        sctx->cur_inode_new_gen = 0;
+       sctx->cur_inode_last_extent = (u64)-1;
 
        /*
         * Set send_progress to current inode. This will tell all get_cur_xxx
@@ -4248,6 +5020,8 @@ static int changed_inode(struct send_ctx *sctx,
                                sctx->left_path->nodes[0], left_ii);
                sctx->cur_inode_mode = btrfs_inode_mode(
                                sctx->left_path->nodes[0], left_ii);
+               sctx->cur_inode_rdev = btrfs_inode_rdev(
+                               sctx->left_path->nodes[0], left_ii);
                if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
                        ret = send_create_inode_if_needed(sctx);
        } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4292,6 +5066,8 @@ static int changed_inode(struct send_ctx *sctx,
                                        sctx->left_path->nodes[0], left_ii);
                        sctx->cur_inode_mode = btrfs_inode_mode(
                                        sctx->left_path->nodes[0], left_ii);
+                       sctx->cur_inode_rdev = btrfs_inode_rdev(
+                                       sctx->left_path->nodes[0], left_ii);
                        ret = send_create_inode_if_needed(sctx);
                        if (ret < 0)
                                goto out;
@@ -4480,14 +5256,18 @@ static int changed_cb(struct btrfs_root *left_root,
        struct send_ctx *sctx = ctx;
 
        if (result == BTRFS_COMPARE_TREE_SAME) {
-               if (key->type != BTRFS_INODE_REF_KEY &&
-                   key->type != BTRFS_INODE_EXTREF_KEY)
-                       return 0;
-               ret = compare_refs(sctx, left_path, key);
-               if (!ret)
+               if (key->type == BTRFS_INODE_REF_KEY ||
+                   key->type == BTRFS_INODE_EXTREF_KEY) {
+                       ret = compare_refs(sctx, left_path, key);
+                       if (!ret)
+                               return 0;
+                       if (ret < 0)
+                               return ret;
+               } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
+                       return maybe_send_hole(sctx, left_path, key);
+               } else {
                        return 0;
-               if (ret < 0)
-                       return ret;
+               }
                result = BTRFS_COMPARE_TREE_CHANGED;
                ret = 0;
        }
@@ -4566,7 +5346,7 @@ join_trans:
        spin_unlock(&send_root->root_item_lock);
 
        if (ctransid != start_ctransid) {
-               WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
+               WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
                                     "send was modified in between. This is "
                                     "probably a bug.\n");
                ret = -EIO;
@@ -4662,6 +5442,21 @@ out:
        return ret;
 }
 
+static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
+{
+       spin_lock(&root->root_item_lock);
+       root->send_in_progress--;
+       /*
+        * Not much left to do, we don't know why it's unbalanced and
+        * can't blindly reset it to 0.
+        */
+       if (root->send_in_progress < 0)
+               btrfs_err(root->fs_info,
+                       "send_in_progress unbalanced %d root %llu\n",
+                       root->send_in_progress, root->root_key.objectid);
+       spin_unlock(&root->root_item_lock);
+}
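
btrfs_root_dec_send_in_progress() drops the per-root counter under root_item_lock and only reports an underflow, since there is no safe way to repair an unbalanced count. A minimal userspace analogue using a mutex in place of the spinlock (root_ctl and its field names are hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    struct root_ctl {
            pthread_mutex_t lock;
            int send_in_progress;
            unsigned long long root_id;
    };

    static void dec_send_in_progress(struct root_ctl *root)
    {
            pthread_mutex_lock(&root->lock);
            root->send_in_progress--;
            /* Can't blindly reset to 0: just report the imbalance. */
            if (root->send_in_progress < 0)
                    fprintf(stderr, "send_in_progress unbalanced %d root %llu\n",
                            root->send_in_progress, root->root_id);
            pthread_mutex_unlock(&root->lock);
    }

    int main(void)
    {
            struct root_ctl root = { PTHREAD_MUTEX_INITIALIZER, 1, 257 };

            dec_send_in_progress(&root);   /* balanced: 1 -> 0 */
            dec_send_in_progress(&root);   /* unbalanced: prints a warning */
            return 0;
    }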
+
 long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 {
        int ret = 0;
@@ -4673,6 +5468,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        struct send_ctx *sctx = NULL;
        u32 i;
        u64 *clone_sources_tmp = NULL;
+       int clone_sources_to_rollback = 0;
+       int sort_clone_roots = 0;
+       int index;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -4680,6 +5478,14 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        send_root = BTRFS_I(file_inode(mnt_file))->root;
        fs_info = send_root->fs_info;
 
+       /*
+        * The subvolume must remain read-only during send, so protect
+        * against making it RW.
+        */
+       spin_lock(&send_root->root_item_lock);
+       send_root->send_in_progress++;
+       spin_unlock(&send_root->root_item_lock);
+
        /*
         * This is done when we lookup the root, it should already be complete
         * by the time we get here.
@@ -4687,32 +5493,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
 
        /*
-        * If we just created this root we need to make sure that the orphan
-        * cleanup has been done and committed since we search the commit root,
-        * so check its commit root transid with our otransid and if they match
-        * commit the transaction to make sure everything is updated.
+        * Userspace tools do the checks and warn the user if it's
+        * not RO.
         */
-       down_read(&send_root->fs_info->extent_commit_sem);
-       if (btrfs_header_generation(send_root->commit_root) ==
-           btrfs_root_otransid(&send_root->root_item)) {
-               struct btrfs_trans_handle *trans;
-
-               up_read(&send_root->fs_info->extent_commit_sem);
-
-               trans = btrfs_attach_transaction_barrier(send_root);
-               if (IS_ERR(trans)) {
-                       if (PTR_ERR(trans) != -ENOENT) {
-                               ret = PTR_ERR(trans);
-                               goto out;
-                       }
-                       /* ENOENT means theres no transaction */
-               } else {
-                       ret = btrfs_commit_transaction(trans, send_root);
-                       if (ret)
-                               goto out;
-               }
-       } else {
-               up_read(&send_root->fs_info->extent_commit_sem);
+       if (!btrfs_root_readonly(send_root)) {
+               ret = -EPERM;
+               goto out;
        }
 
        arg = memdup_user(arg_, sizeof(*arg));
@@ -4753,8 +5539,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                goto out;
        }
 
-       sctx->mnt = mnt_file->f_path.mnt;
-
        sctx->send_root = send_root;
        sctx->clone_roots_cnt = arg->clone_sources_count;
 
@@ -4771,6 +5555,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                goto out;
        }
 
+       sctx->pending_dir_moves = RB_ROOT;
+       sctx->waiting_dir_moves = RB_ROOT;
+       sctx->orphan_dirs = RB_ROOT;
+
        sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
                        (arg->clone_sources_count + 1));
        if (!sctx->clone_roots) {
@@ -4798,11 +5586,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                        key.objectid = clone_sources_tmp[i];
                        key.type = BTRFS_ROOT_ITEM_KEY;
                        key.offset = (u64)-1;
+
+                       index = srcu_read_lock(&fs_info->subvol_srcu);
+
                        clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
                        if (IS_ERR(clone_root)) {
+                               srcu_read_unlock(&fs_info->subvol_srcu, index);
                                ret = PTR_ERR(clone_root);
                                goto out;
                        }
+                       clone_sources_to_rollback = i + 1;
+                       spin_lock(&clone_root->root_item_lock);
+                       clone_root->send_in_progress++;
+                       if (!btrfs_root_readonly(clone_root)) {
+                               spin_unlock(&clone_root->root_item_lock);
+                               srcu_read_unlock(&fs_info->subvol_srcu, index);
+                               ret = -EPERM;
+                               goto out;
+                       }
+                       spin_unlock(&clone_root->root_item_lock);
+                       srcu_read_unlock(&fs_info->subvol_srcu, index);
+
                        sctx->clone_roots[i].root = clone_root;
                }
                vfree(clone_sources_tmp);
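
Each clone source gets its send_in_progress bumped and its read-only status verified as it is looked up, and clone_sources_to_rollback remembers how many were registered so that an early failure only unwinds those. A minimal sketch of that register-then-rollback pattern (register_source()/unregister_source() are hypothetical):

    #include <stdio.h>

    struct source { const char *name; int readonly; int in_progress; };

    static int register_source(struct source *s)
    {
            s->in_progress++;
            /* The caller rolls this increment back too, so keep the count. */
            if (!s->readonly)
                    return -1;              /* -EPERM in the real code */
            return 0;
    }

    static void unregister_source(struct source *s)
    {
            s->in_progress--;
    }

    int main(void)
    {
            struct source srcs[] = {
                    { "snap1", 1, 0 }, { "snap2", 1, 0 }, { "rw-subvol", 0, 0 },
            };
            int to_rollback = 0, ret = 0, i;

            for (i = 0; i < 3; i++) {
                    to_rollback = i + 1;    /* registered up to and including i */
                    ret = register_source(&srcs[i]);
                    if (ret)
                            break;
            }
            if (ret) {
                    for (i = 0; i < to_rollback; i++)
                            unregister_source(&srcs[i]);
                    printf("failed, rolled back %d sources\n", to_rollback);
            }
            return 0;
    }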
@@ -4813,11 +5617,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                key.objectid = arg->parent_root;
                key.type = BTRFS_ROOT_ITEM_KEY;
                key.offset = (u64)-1;
+
+               index = srcu_read_lock(&fs_info->subvol_srcu);
+
                sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
                if (IS_ERR(sctx->parent_root)) {
+                       srcu_read_unlock(&fs_info->subvol_srcu, index);
                        ret = PTR_ERR(sctx->parent_root);
                        goto out;
                }
+
+               spin_lock(&sctx->parent_root->root_item_lock);
+               sctx->parent_root->send_in_progress++;
+               if (!btrfs_root_readonly(sctx->parent_root)) {
+                       spin_unlock(&sctx->parent_root->root_item_lock);
+                       srcu_read_unlock(&fs_info->subvol_srcu, index);
+                       ret = -EPERM;
+                       goto out;
+               }
+               spin_unlock(&sctx->parent_root->root_item_lock);
+
+               srcu_read_unlock(&fs_info->subvol_srcu, index);
        }
 
        /*
@@ -4831,6 +5651,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        sort(sctx->clone_roots, sctx->clone_roots_cnt,
                        sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
                        NULL);
+       sort_clone_roots = 1;
 
        ret = send_subvol(sctx);
        if (ret < 0)
@@ -4846,6 +5667,58 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        }
 
 out:
+       WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
+       while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
+               struct rb_node *n;
+               struct pending_dir_move *pm;
+
+               n = rb_first(&sctx->pending_dir_moves);
+               pm = rb_entry(n, struct pending_dir_move, node);
+               while (!list_empty(&pm->list)) {
+                       struct pending_dir_move *pm2;
+
+                       pm2 = list_first_entry(&pm->list,
+                                              struct pending_dir_move, list);
+                       free_pending_move(sctx, pm2);
+               }
+               free_pending_move(sctx, pm);
+       }
+
+       WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
+       while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
+               struct rb_node *n;
+               struct waiting_dir_move *dm;
+
+               n = rb_first(&sctx->waiting_dir_moves);
+               dm = rb_entry(n, struct waiting_dir_move, node);
+               rb_erase(&dm->node, &sctx->waiting_dir_moves);
+               kfree(dm);
+       }
+
+       WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+       while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+               struct rb_node *n;
+               struct orphan_dir_info *odi;
+
+               n = rb_first(&sctx->orphan_dirs);
+               odi = rb_entry(n, struct orphan_dir_info, node);
+               free_orphan_dir_info(sctx, odi);
+       }
+
+       if (sort_clone_roots) {
+               for (i = 0; i < sctx->clone_roots_cnt; i++)
+                       btrfs_root_dec_send_in_progress(
+                                       sctx->clone_roots[i].root);
+       } else {
+               for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+                       btrfs_root_dec_send_in_progress(
+                                       sctx->clone_roots[i].root);
+
+               btrfs_root_dec_send_in_progress(send_root);
+       }
+       if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+               btrfs_root_dec_send_in_progress(sctx->parent_root);
+
        kfree(arg);
        vfree(clone_sources_tmp);
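
The teardown above drains each of the new rb-trees by repeatedly taking the first node and freeing it (plus any list hanging off it), warning first if anything is left behind after a successful send. A minimal userspace sketch of the same drain-until-empty pattern over a linked list; struct pending and free_pending() are hypothetical:

    #include <stdio.h>
    #include <stdlib.h>

    struct pending {
            int id;
            struct pending *next;
    };

    /* Unlink and free the head of the list, like freeing the first
     * rb-tree node on each loop iteration. */
    static void free_pending(struct pending **head)
    {
            struct pending *p = *head;

            *head = p->next;
            printf("freeing leftover %d\n", p->id);
            free(p);
    }

    int main(void)
    {
            struct pending *head = NULL;
            int i, ret = -1;   /* pretend the send failed part way through */

            for (i = 0; i < 3; i++) {
                    struct pending *p = malloc(sizeof(*p));

                    if (!p)
                            break;
                    p->id = i;
                    p->next = head;
                    head = p;
            }

            /* On success nothing should be left behind (the WARN_ON case). */
            if (ret == 0 && head)
                    fprintf(stderr, "warning: leftovers after successful send\n");
            while (head)
                    free_pending(&head);
            return 0;
    }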