btrfs: Replace fs_info->rmw_workers workqueue with btrfs_workqueue.
[cascardo/linux.git] / fs / btrfs / send.c
index 78a43b2..c2522e4 100644 (file)
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/radix-tree.h>
-#include <linux/crc32c.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
 
 #include "send.h"
 #include "backref.h"
+#include "hash.h"
 #include "locking.h"
 #include "disk-io.h"
 #include "btrfs_inode.h"
@@ -51,15 +51,18 @@ struct fs_path {
                struct {
                        char *start;
                        char *end;
-                       char *prepared;
 
                        char *buf;
-                       int buf_len;
-                       unsigned int reversed:1;
-                       unsigned int virtual_mem:1;
+                       unsigned short buf_len:15;
+                       unsigned short reversed:1;
                        char inline_buf[];
                };
-               char pad[PAGE_SIZE];
+               /*
+                * Average path length does not exceed 200 bytes, we'll have
+                * better packing in the slab and higher chance to satisfy
+                * an allocation later during send.
+                */
+               char pad[256];
        };
 };
 #define FS_PATH_INLINE_SIZE \
@@ -109,6 +112,7 @@ struct send_ctx {
        int cur_inode_deleted;
        u64 cur_inode_size;
        u64 cur_inode_mode;
+       u64 cur_inode_rdev;
        u64 cur_inode_last_extent;
 
        u64 send_progress;
@@ -121,6 +125,127 @@ struct send_ctx {
        int name_cache_size;
 
        char *read_buf;
+
+       /*
+        * We process inodes in increasing inode-number order, so if before an
+        * incremental send we reverse the parent/child relationship of
+        * directories such that a directory with a lower inode number was
+        * the parent of a directory with a higher inode number, and the one
+        * becoming the new parent got renamed too, we can't rename/move the
+        * directory with lower inode number when we finish processing it - we
+        * must process the directory with higher inode number first, then
+        * rename/move it and then rename/move the directory with lower inode
+        * number. Example follows.
+        *
+        * Tree state when the first send was performed:
+        *
+        * .
+        * |-- a                   (ino 257)
+        *     |-- b               (ino 258)
+        *         |
+        *         |
+        *         |-- c           (ino 259)
+        *         |   |-- d       (ino 260)
+        *         |
+        *         |-- c2          (ino 261)
+        *
+        * Tree state when the second (incremental) send is performed:
+        *
+        * .
+        * |-- a                   (ino 257)
+        *     |-- b               (ino 258)
+        *         |-- c2          (ino 261)
+        *             |-- d2      (ino 260)
+        *                 |-- cc  (ino 259)
+        *
+        * The sequence of steps that led to the second state was:
+        *
+        * mv /a/b/c/d /a/b/c2/d2
+        * mv /a/b/c /a/b/c2/d2/cc
+        *
+        * "c" has lower inode number, but we can't move it (2nd mv operation)
+        * before we move "d", which has higher inode number.
+        *
+        * So we just memorize which move/rename operations must be performed
+        * later when their respective parent is processed and moved/renamed.
+        */
+
+       /* Indexed by parent directory inode number. */
+       struct rb_root pending_dir_moves;
+
+       /*
+        * Reverse index, indexed by the inode number of a directory that
+        * is waiting for the move/rename of its immediate parent before its
+        * own move/rename can be performed.
+        */
+       struct rb_root waiting_dir_moves;
+
+       /*
+        * A directory that is going to be rm'ed might have a child directory
+        * which is in the pending directory moves index above. In this case,
+        * the directory can only be removed after the move/rename of its child
+        * is performed. Example:
+        *
+        * Parent snapshot:
+        *
+        * .                        (ino 256)
+        * |-- a/                   (ino 257)
+        *     |-- b/               (ino 258)
+        *         |-- c/           (ino 259)
+        *         |   |-- x/       (ino 260)
+        *         |
+        *         |-- y/           (ino 261)
+        *
+        * Send snapshot:
+        *
+        * .                        (ino 256)
+        * |-- a/                   (ino 257)
+        *     |-- b/               (ino 258)
+        *         |-- YY/          (ino 261)
+        *              |-- x/      (ino 260)
+        *
+        * Sequence of steps that led to the send snapshot:
+        * rm -f /a/b/c/foo.txt
+        * mv /a/b/y /a/b/YY
+        * mv /a/b/c/x /a/b/YY
+        * rmdir /a/b/c
+        *
+        * When the child is processed, its move/rename is delayed until its
+        * parent is processed (as explained above), but all other operations
+        * like update utimes, chown, chgrp, etc, are performed and the paths
+        * that it uses for those operations must use the orphanized name of
+        * its parent (the directory we're going to rm later), so we need to
+        * memorize that name.
+        *
+        * Indexed by the inode number of the directory to be deleted.
+        */
+       struct rb_root orphan_dirs;
+};
+
+struct pending_dir_move {
+       struct rb_node node; /* likely linked in pending_dir_moves - confirm */
+       struct list_head list;
+       u64 parent_ino; /* NOTE(review): likely the rb-tree key - confirm */
+       u64 ino;
+       u64 gen;
+       struct list_head update_refs;
+};
+
+struct waiting_dir_move {
+       struct rb_node node; /* linked into sctx->waiting_dir_moves */
+       u64 ino; /* rb-tree key: inode number of the waiting directory */
+       /*
+        * There might be some directory that could not be removed because it
+        * was waiting for this directory inode to be moved first. Therefore
+        * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+        */
+       u64 rmdir_ino; /* 0 at insertion; can_rmdir() stores the directory */
+};
+
+struct orphan_dir_info {
+       struct rb_node node; /* linked into sctx->orphan_dirs */
+       u64 ino; /* rb-tree key: inode number of the directory to delete */
+       u64 gen; /* 0 at allocation; can_rmdir() stores dir_gen here */
 };
 
 struct name_cache_entry {
@@ -144,6 +269,13 @@ struct name_cache_entry {
        char name[];
 };
 
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
 static int need_send_hole(struct send_ctx *sctx)
 {
        return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -172,7 +304,6 @@ static struct fs_path *fs_path_alloc(void)
        if (!p)
                return NULL;
        p->reversed = 0;
-       p->virtual_mem = 0;
        p->buf = p->inline_buf;
        p->buf_len = FS_PATH_INLINE_SIZE;
        fs_path_reset(p);
@@ -195,12 +326,8 @@ static void fs_path_free(struct fs_path *p)
 {
        if (!p)
                return;
-       if (p->buf != p->inline_buf) {
-               if (p->virtual_mem)
-                       vfree(p->buf);
-               else
-                       kfree(p->buf);
-       }
+       if (p->buf != p->inline_buf)
+               kfree(p->buf);
        kfree(p);
 }
 
@@ -222,40 +349,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
 
        path_len = p->end - p->start;
        old_buf_len = p->buf_len;
-       len = PAGE_ALIGN(len);
-
-       if (p->buf == p->inline_buf) {
-               tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
-               if (!tmp_buf) {
-                       tmp_buf = vmalloc(len);
-                       if (!tmp_buf)
-                               return -ENOMEM;
-                       p->virtual_mem = 1;
-               }
-               memcpy(tmp_buf, p->buf, p->buf_len);
-               p->buf = tmp_buf;
-               p->buf_len = len;
-       } else {
-               if (p->virtual_mem) {
-                       tmp_buf = vmalloc(len);
-                       if (!tmp_buf)
-                               return -ENOMEM;
-                       memcpy(tmp_buf, p->buf, p->buf_len);
-                       vfree(p->buf);
-               } else {
-                       tmp_buf = krealloc(p->buf, len, GFP_NOFS);
-                       if (!tmp_buf) {
-                               tmp_buf = vmalloc(len);
-                               if (!tmp_buf)
-                                       return -ENOMEM;
-                               memcpy(tmp_buf, p->buf, p->buf_len);
-                               kfree(p->buf);
-                               p->virtual_mem = 1;
-                       }
-               }
-               p->buf = tmp_buf;
-               p->buf_len = len;
-       }
+
+       /*
+        * First time the inline_buf does not suffice
+        */
+       if (p->buf == p->inline_buf)
+               tmp_buf = kmalloc(len, GFP_NOFS);
+       else
+               tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+       if (!tmp_buf)
+               return -ENOMEM;
+       p->buf = tmp_buf;
+       /*
+        * The real size of the buffer is bigger, this will let the fast path
+        * happen most of the time
+        */
+       p->buf_len = ksize(p->buf);
+
        if (p->reversed) {
                tmp_buf = p->buf + old_buf_len - path_len - 1;
                p->end = p->buf + p->buf_len - 1;
@@ -268,7 +378,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
        return 0;
 }
 
-static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+                                  char **prepared)
 {
        int ret;
        int new_len;
@@ -284,11 +395,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
                if (p->start != p->end)
                        *--p->start = '/';
                p->start -= name_len;
-               p->prepared = p->start;
+               *prepared = p->start;
        } else {
                if (p->start != p->end)
                        *p->end++ = '/';
-               p->prepared = p->end;
+               *prepared = p->end;
                p->end += name_len;
                *p->end = 0;
        }
@@ -300,12 +411,12 @@ out:
 static int fs_path_add(struct fs_path *p, const char *name, int name_len)
 {
        int ret;
+       char *prepared;
 
-       ret = fs_path_prepare_for_add(p, name_len);
+       ret = fs_path_prepare_for_add(p, name_len, &prepared);
        if (ret < 0)
                goto out;
-       memcpy(p->prepared, name, name_len);
-       p->prepared = NULL;
+       memcpy(prepared, name, name_len);
 
 out:
        return ret;
@@ -314,12 +425,12 @@ out:
 static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
 {
        int ret;
+       char *prepared;
 
-       ret = fs_path_prepare_for_add(p, p2->end - p2->start);
+       ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
        if (ret < 0)
                goto out;
-       memcpy(p->prepared, p2->start, p2->end - p2->start);
-       p->prepared = NULL;
+       memcpy(prepared, p2->start, p2->end - p2->start);
 
 out:
        return ret;
@@ -330,13 +441,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
                                          unsigned long off, int len)
 {
        int ret;
+       char *prepared;
 
-       ret = fs_path_prepare_for_add(p, len);
+       ret = fs_path_prepare_for_add(p, len, &prepared);
        if (ret < 0)
                goto out;
 
-       read_extent_buffer(eb, p->prepared, off, len);
-       p->prepared = NULL;
+       read_extent_buffer(eb, prepared, off, len);
 
 out:
        return ret;
@@ -550,7 +661,7 @@ static int send_cmd(struct send_ctx *sctx)
        hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
        hdr->crc = 0;
 
-       crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+       crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
        hdr->crc = cpu_to_le32(crc);
 
        ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -845,9 +956,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
        struct btrfs_dir_item *di;
        struct btrfs_key di_key;
        char *buf = NULL;
-       char *buf2 = NULL;
-       int buf_len;
-       int buf_virtual = 0;
+       const int buf_len = PATH_MAX;
        u32 name_len;
        u32 data_len;
        u32 cur;
@@ -857,7 +966,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
        int num;
        u8 type;
 
-       buf_len = PAGE_SIZE;
        buf = kmalloc(buf_len, GFP_NOFS);
        if (!buf) {
                ret = -ENOMEM;
@@ -879,30 +987,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
                type = btrfs_dir_type(eb, di);
                btrfs_dir_item_key_to_cpu(eb, di, &di_key);
 
+               /*
+                * Path too long
+                */
                if (name_len + data_len > buf_len) {
-                       buf_len = PAGE_ALIGN(name_len + data_len);
-                       if (buf_virtual) {
-                               buf2 = vmalloc(buf_len);
-                               if (!buf2) {
-                                       ret = -ENOMEM;
-                                       goto out;
-                               }
-                               vfree(buf);
-                       } else {
-                               buf2 = krealloc(buf, buf_len, GFP_NOFS);
-                               if (!buf2) {
-                                       buf2 = vmalloc(buf_len);
-                                       if (!buf2) {
-                                               ret = -ENOMEM;
-                                               goto out;
-                                       }
-                                       kfree(buf);
-                                       buf_virtual = 1;
-                               }
-                       }
-
-                       buf = buf2;
-                       buf2 = NULL;
+                       ret = -ENAMETOOLONG;
+                       goto out;
                }
 
                read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -925,10 +1015,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
        }
 
 out:
-       if (buf_virtual)
-               vfree(buf);
-       else
-               kfree(buf);
+       kfree(buf);
        return ret;
 }
 
@@ -1222,8 +1309,6 @@ static int find_extent_clone(struct send_ctx *sctx,
                extent_item_pos = logical - found_key.objectid;
        else
                extent_item_pos = 0;
-
-       extent_item_pos = logical - found_key.objectid;
        ret = iterate_extent_inodes(sctx->send_root->fs_info,
                                        found_key.objectid, extent_item_pos, 1,
                                        __iterate_backrefs, backref_ctx);
@@ -1234,7 +1319,7 @@ static int find_extent_clone(struct send_ctx *sctx,
        if (!backref_ctx->found_itself) {
                /* found a bug in backref code? */
                ret = -EIO;
-               printk(KERN_ERR "btrfs: ERROR did not find backref in "
+               btrfs_err(sctx->send_root->fs_info, "did not find backref in "
                                "send_root. inode=%llu, offset=%llu, "
                                "disk_byte=%llu found extent=%llu\n",
                                ino, data_offset, disk_byte, found_key.objectid);
@@ -1262,6 +1347,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
        }
 
        if (cur_clone_root) {
+               if (compressed != BTRFS_COMPRESS_NONE) {
+                       /*
+                        * Offsets given by iterate_extent_inodes() are relative
+                        * to the start of the extent, we need to add logical
+                        * offset from the file extent item.
+                        * (See why at backref.c:check_extent_in_eb())
+                        */
+                       cur_clone_root->offset += btrfs_file_extent_offset(eb,
+                                                                          fi);
+               }
                *found = cur_clone_root;
                ret = 0;
        } else {
@@ -1307,7 +1402,7 @@ static int read_symlink(struct btrfs_root *root,
        BUG_ON(compression);
 
        off = btrfs_file_extent_inline_start(ei);
-       len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+       len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
 
        ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
 
@@ -1336,13 +1431,9 @@ static int gen_unique_name(struct send_ctx *sctx,
                return -ENOMEM;
 
        while (1) {
-               len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
+               len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
                                ino, gen, idx);
-               if (len >= sizeof(tmp)) {
-                       /* should really not happen */
-                       ret = -EOVERFLOW;
-                       goto out;
-               }
+               ASSERT(len < sizeof(tmp));
 
                di = btrfs_lookup_dir_item(NULL, sctx->send_root,
                                path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1818,13 +1909,20 @@ static void name_cache_delete(struct send_ctx *sctx,
 
        nce_head = radix_tree_lookup(&sctx->name_cache,
                        (unsigned long)nce->ino);
-       BUG_ON(!nce_head);
+       if (!nce_head) {
+               btrfs_err(sctx->send_root->fs_info,
+             "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+                       nce->ino, sctx->name_cache_size);
+       }
 
        list_del(&nce->radix_list);
        list_del(&nce->list);
        sctx->name_cache_size--;
 
-       if (list_empty(nce_head)) {
+       /*
+        * We may not get to the final release of nce_head if the lookup fails
+        */
+       if (nce_head && list_empty(nce_head)) {
                radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
                kfree(nce_head);
        }
@@ -2058,12 +2156,27 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
        while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
                fs_path_reset(name);
 
-               ret = __get_cur_name_and_parent(sctx, ino, gen,
-                               &parent_inode, &parent_gen, name);
+               if (is_waiting_for_rm(sctx, ino)) {
+                       ret = gen_unique_name(sctx, ino, gen, name);
+                       if (ret < 0)
+                               goto out;
+                       ret = fs_path_add_path(dest, name);
+                       break;
+               }
+
+               if (is_waiting_for_move(sctx, ino)) {
+                       ret = get_first_ref(sctx->parent_root, ino,
+                                           &parent_inode, &parent_gen, name);
+               } else {
+                       ret = __get_cur_name_and_parent(sctx, ino, gen,
+                                                       &parent_inode,
+                                                       &parent_gen, name);
+                       if (ret)
+                               stop = 1;
+               }
+
                if (ret < 0)
                        goto out;
-               if (ret)
-                       stop = 1;
 
                ret = fs_path_add_path(dest, name);
                if (ret < 0)
@@ -2095,7 +2208,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
        char *name = NULL;
        int namelen;
 
-       path = alloc_path_for_send();
+       path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
@@ -2327,10 +2440,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
        if (!p)
                return -ENOMEM;
 
-       ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
-                       NULL, &rdev);
-       if (ret < 0)
-               goto out;
+       if (ino != sctx->cur_ino) {
+               ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+                                    NULL, NULL, &rdev);
+               if (ret < 0)
+                       goto out;
+       } else {
+               gen = sctx->cur_inode_gen;
+               mode = sctx->cur_inode_mode;
+               rdev = sctx->cur_inode_rdev;
+       }
 
        if (S_ISREG(mode)) {
                cmd = BTRFS_SEND_C_MKFILE;
@@ -2410,17 +2529,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
        key.objectid = dir;
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = 0;
+       ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
        while (1) {
-               ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-                               1, 0);
-               if (ret < 0)
-                       goto out;
-               if (!ret) {
-                       eb = path->nodes[0];
-                       slot = path->slots[0];
-                       btrfs_item_key_to_cpu(eb, &found_key, slot);
+               eb = path->nodes[0];
+               slot = path->slots[0];
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(sctx->send_root, path);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       continue;
                }
-               if (ret || found_key.objectid != key.objectid ||
+
+               btrfs_item_key_to_cpu(eb, &found_key, slot);
+               if (found_key.objectid != key.objectid ||
                    found_key.type != key.type) {
                        ret = 0;
                        goto out;
@@ -2435,8 +2563,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
                        goto out;
                }
 
-               key.offset = found_key.offset + 1;
-               btrfs_release_path(path);
+               path->slots[0]++;
        }
 
 out:
@@ -2574,12 +2701,78 @@ out:
        return ret;
 }
 
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) /* find or insert */
+{
+       struct rb_node **p = &sctx->orphan_dirs.rb_node;
+       struct rb_node *parent = NULL;
+       struct orphan_dir_info *entry, *odi;
+
+       odi = kmalloc(sizeof(*odi), GFP_NOFS); /* allocated before the lookup */
+       if (!odi)
+               return ERR_PTR(-ENOMEM); /* can_rmdir() tests with IS_ERR() */
+       odi->ino = dir_ino;
+       odi->gen = 0; /* can_rmdir() overwrites this with dir_gen */
+
+       while (*p) { /* walk down to the link where dir_ino belongs */
+               parent = *p;
+               entry = rb_entry(parent, struct orphan_dir_info, node);
+               if (dir_ino < entry->ino) {
+                       p = &(*p)->rb_left;
+               } else if (dir_ino > entry->ino) {
+                       p = &(*p)->rb_right;
+               } else {
+                       kfree(odi); /* already indexed: drop the unused node */
+                       return entry; /* existing entry, not a new one */
+               }
+       }
+
+       rb_link_node(&odi->node, parent, p);
+       rb_insert_color(&odi->node, &sctx->orphan_dirs);
+       return odi; /* newly inserted entry */
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) /* lookup only */
+{
+       struct rb_node *n = sctx->orphan_dirs.rb_node;
+       struct orphan_dir_info *entry;
+
+       while (n) { /* binary search of the tree keyed by entry->ino */
+               entry = rb_entry(n, struct orphan_dir_info, node);
+               if (dir_ino < entry->ino)
+                       n = n->rb_left;
+               else if (dir_ino > entry->ino)
+                       n = n->rb_right;
+               else
+                       return entry; /* exact match on inode number */
+       }
+       return NULL; /* dir_ino is not in sctx->orphan_dirs */
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+       struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+       return odi != NULL; /* 1 if dir_ino is in sctx->orphan_dirs, else 0 */
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+                                struct orphan_dir_info *odi)
+{
+       if (!odi)
+               return; /* a NULL entry is accepted and ignored */
+       rb_erase(&odi->node, &sctx->orphan_dirs); /* unlink before freeing */
+       kfree(odi);
+}
+
 /*
  * Returns 1 if a directory can be removed at this point in time.
  * We check this by iterating all dir items and checking if the inode behind
  * the dir item was already processed.
  */
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+                    u64 send_progress)
 {
        int ret = 0;
        struct btrfs_root *root = sctx->parent_root;
@@ -2602,31 +2795,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
        key.objectid = dir;
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
        while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (!ret) {
-                       btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                       path->slots[0]);
+               struct waiting_dir_move *dm;
+
+               if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
                }
-               if (ret || found_key.objectid != key.objectid ||
-                   found_key.type != key.type) {
+               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                     path->slots[0]);
+               if (found_key.objectid != key.objectid ||
+                   found_key.type != key.type)
                        break;
-               }
 
                di = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                struct btrfs_dir_item);
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
 
+               dm = get_waiting_dir_move(sctx, loc.objectid);
+               if (dm) {
+                       struct orphan_dir_info *odi;
+
+                       odi = add_orphan_dir_info(sctx, dir);
+                       if (IS_ERR(odi)) {
+                               ret = PTR_ERR(odi);
+                               goto out;
+                       }
+                       odi->gen = dir_gen;
+                       dm->rmdir_ino = dir;
+                       ret = 0;
+                       goto out;
+               }
+
                if (loc.objectid > send_progress) {
                        ret = 0;
                        goto out;
                }
 
-               btrfs_release_path(path);
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
 
        ret = 1;
@@ -2636,10 +2850,407 @@ out:
        return ret;
 }
 
+/*
+ * Tell whether @ino currently has an entry in sctx->waiting_dir_moves.
+ * Returns 1 if such an entry exists, 0 otherwise.
+ */
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
+{
+       return get_waiting_dir_move(sctx, ino) != NULL;
+}
+
+/*
+ * Insert a new waiting_dir_move entry for @ino into the
+ * sctx->waiting_dir_moves rbtree, which is ordered by inode number.
+ * The new entry starts with rmdir_ino set to 0.
+ *
+ * Returns 0 on success, -ENOMEM if the entry could not be allocated, or
+ * -EEXIST if @ino already has an entry in the tree.
+ */
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+       struct rb_node **link = &sctx->waiting_dir_moves.rb_node;
+       struct rb_node *parent = NULL;
+       struct waiting_dir_move *new_dm;
+
+       new_dm = kmalloc(sizeof(*new_dm), GFP_NOFS);
+       if (!new_dm)
+               return -ENOMEM;
+       new_dm->ino = ino;
+       new_dm->rmdir_ino = 0;
+
+       /* Descend to the empty slot where the new node belongs. */
+       while (*link) {
+               struct waiting_dir_move *cur;
+
+               parent = *link;
+               cur = rb_entry(parent, struct waiting_dir_move, node);
+               if (ino == cur->ino) {
+                       /* Duplicate key: drop the unused allocation. */
+                       kfree(new_dm);
+                       return -EEXIST;
+               }
+               if (ino < cur->ino)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       rb_link_node(&new_dm->node, parent, link);
+       rb_insert_color(&new_dm->node, &sctx->waiting_dir_moves);
+       return 0;
+}
+
+/*
+ * Look up the waiting_dir_move entry for @ino in the
+ * sctx->waiting_dir_moves rbtree.
+ * Returns the entry, or NULL if @ino has none.
+ */
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+       struct rb_node *cur_node = sctx->waiting_dir_moves.rb_node;
+
+       while (cur_node) {
+               struct waiting_dir_move *dm;
+
+               dm = rb_entry(cur_node, struct waiting_dir_move, node);
+               if (ino == dm->ino)
+                       return dm;
+               cur_node = (ino < dm->ino) ? cur_node->rb_left :
+                                            cur_node->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * Unlink @dm from the sctx->waiting_dir_moves rbtree and free it.
+ * A NULL @dm is accepted and ignored.
+ */
+static void free_waiting_dir_move(struct send_ctx *sctx,
+                                 struct waiting_dir_move *dm)
+{
+       if (dm) {
+               rb_erase(&dm->node, &sctx->waiting_dir_moves);
+               kfree(dm);
+       }
+}
+
+/*
+ * Queue a deferred move for the inode currently being processed
+ * (sctx->cur_ino / sctx->cur_inode_gen), keyed by @parent_ino in the
+ * sctx->pending_dir_moves rbtree.
+ *
+ * Every ref on sctx->deleted_refs and sctx->new_refs is passed to dup_ref()
+ * with pm->update_refs as the destination, and the current inode is
+ * registered through add_waiting_dir_move().
+ *
+ * Returns 0 on success or a non-zero error code (e.g. -ENOMEM) on failure,
+ * in which case the new entry and its update_refs are released.
+ */
+static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
+{
+       struct rb_node **p = &sctx->pending_dir_moves.rb_node;
+       struct rb_node *parent = NULL;
+       struct pending_dir_move *entry, *pm;
+       struct recorded_ref *cur;
+       int exists = 0;
+       int ret;
+
+       pm = kmalloc(sizeof(*pm), GFP_NOFS);
+       if (!pm)
+               return -ENOMEM;
+       pm->parent_ino = parent_ino;
+       pm->ino = sctx->cur_ino;
+       pm->gen = sctx->cur_inode_gen;
+       INIT_LIST_HEAD(&pm->list);
+       INIT_LIST_HEAD(&pm->update_refs);
+       /* Mark the node as unlinked so free_pending_move() can tell. */
+       RB_CLEAR_NODE(&pm->node);
+
+       /*
+        * Find either an existing entry with the same parent_ino (left in
+        * 'entry') or the slot where a new tree node would be linked.
+        */
+       while (*p) {
+               parent = *p;
+               entry = rb_entry(parent, struct pending_dir_move, node);
+               if (parent_ino < entry->parent_ino) {
+                       p = &(*p)->rb_left;
+               } else if (parent_ino > entry->parent_ino) {
+                       p = &(*p)->rb_right;
+               } else {
+                       exists = 1;
+                       break;
+               }
+       }
+
+       list_for_each_entry(cur, &sctx->deleted_refs, list) {
+               ret = dup_ref(cur, &pm->update_refs);
+               if (ret < 0)
+                       goto out;
+       }
+       list_for_each_entry(cur, &sctx->new_refs, list) {
+               ret = dup_ref(cur, &pm->update_refs);
+               if (ret < 0)
+                       goto out;
+       }
+
+       ret = add_waiting_dir_move(sctx, pm->ino);
+       if (ret)
+               goto out;
+
+       /*
+        * Only one entry per parent_ino lives in the rbtree; further entries
+        * for the same parent are chained on that entry's list.
+        */
+       if (exists) {
+               list_add_tail(&pm->list, &entry->list);
+       } else {
+               rb_link_node(&pm->node, parent, p);
+               rb_insert_color(&pm->node, &sctx->pending_dir_moves);
+       }
+       ret = 0;
+out:
+       if (ret) {
+               __free_recorded_refs(&pm->update_refs);
+               kfree(pm);
+       }
+       return ret;
+}
+
+/*
+ * Find the pending_dir_move node keyed by @parent_ino in the
+ * sctx->pending_dir_moves rbtree. Returns NULL when there is none.
+ */
+static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
+                                                     u64 parent_ino)
+{
+       struct rb_node *cur_node = sctx->pending_dir_moves.rb_node;
+
+       while (cur_node) {
+               struct pending_dir_move *pm;
+
+               pm = rb_entry(cur_node, struct pending_dir_move, node);
+               if (parent_ino == pm->parent_ino)
+                       return pm;
+               cur_node = (parent_ino < pm->parent_ino) ? cur_node->rb_left :
+                                                          cur_node->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * Carry out the deferred rename/move described by @pm.
+ *
+ * The waiting_dir_move entry of pm->ino is dropped, a rename from the old
+ * path to the new path is sent, the rmdir recorded in that entry
+ * (rmdir_ino), if any, is sent when can_rmdir() now allows it, and utimes
+ * are sent for pm->ino and for the directories listed in pm->update_refs.
+ *
+ * sctx->send_progress is changed temporarily while the paths are computed
+ * and is restored to its original value before returning.
+ *
+ * Returns a negative value on failure; otherwise the non-negative result
+ * of the last send_utimes() call. @pm itself is not freed here.
+ */
+static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
+{
+       struct fs_path *from_path = NULL;
+       struct fs_path *to_path = NULL;
+       struct fs_path *name = NULL;
+       u64 orig_progress = sctx->send_progress;
+       struct recorded_ref *cur;
+       u64 parent_ino, parent_gen;
+       struct waiting_dir_move *dm = NULL;
+       u64 rmdir_ino = 0;
+       int ret;
+
+       name = fs_path_alloc();
+       from_path = fs_path_alloc();
+       if (!name || !from_path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * pm->ino stops waiting; remember the directory whose rmdir was
+        * postponed because of it, if any.
+        *
+        * NOTE(review): dm is dereferenced right after ASSERT(); if ASSERT()
+        * can be compiled out, a missing entry would be a NULL dereference
+        * here - confirm.
+        */
+       dm = get_waiting_dir_move(sctx, pm->ino);
+       ASSERT(dm);
+       rmdir_ino = dm->rmdir_ino;
+       free_waiting_dir_move(sctx, dm);
+
+       /* Parent and name of pm->ino as found in sctx->parent_root. */
+       ret = get_first_ref(sctx->parent_root, pm->ino,
+                           &parent_ino, &parent_gen, name);
+       if (ret < 0)
+               goto out;
+
+       if (parent_ino == sctx->cur_ino) {
+               /* child only renamed, not moved */
+               ASSERT(parent_gen == sctx->cur_inode_gen);
+               ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+                                  from_path);
+               if (ret < 0)
+                       goto out;
+               ret = fs_path_add_path(from_path, name);
+               if (ret < 0)
+                       goto out;
+       } else {
+               /* child moved and maybe renamed too */
+               sctx->send_progress = pm->ino;
+               ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+               if (ret < 0)
+                       goto out;
+       }
+
+       fs_path_free(name);
+       name = NULL;
+
+       to_path = fs_path_alloc();
+       if (!to_path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /* Compute the destination path with send_progress past cur_ino. */
+       sctx->send_progress = sctx->cur_ino + 1;
+       ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
+       if (ret < 0)
+               goto out;
+
+       ret = send_rename(sctx, from_path, to_path);
+       if (ret < 0)
+               goto out;
+
+       if (rmdir_ino) {
+               struct orphan_dir_info *odi;
+
+               odi = get_orphan_dir_info(sctx, rmdir_ino);
+               if (!odi) {
+                       /* already deleted */
+                       goto finish;
+               }
+               ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+               if (ret < 0)
+                       goto out;
+               /* Still cannot be removed: skip the rmdir for now. */
+               if (!ret)
+                       goto finish;
+
+               name = fs_path_alloc();
+               if (!name) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+               if (ret < 0)
+                       goto out;
+               ret = send_rmdir(sctx, name);
+               if (ret < 0)
+                       goto out;
+               free_orphan_dir_info(sctx, odi);
+       }
+
+finish:
+       ret = send_utimes(sctx, pm->ino, pm->gen);
+       if (ret < 0)
+               goto out;
+
+       /*
+        * After rename/move, need to update the utimes of both new parent(s)
+        * and old parent(s).
+        */
+       list_for_each_entry(cur, &pm->update_refs, list) {
+               /* No utimes for the directory handled by the rmdir above. */
+               if (cur->dir == rmdir_ino)
+                       continue;
+               ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+               if (ret < 0)
+                       goto out;
+       }
+
+out:
+       fs_path_free(name);
+       fs_path_free(from_path);
+       fs_path_free(to_path);
+       sctx->send_progress = orig_progress;
+
+       return ret;
+}
+
+/*
+ * Release @m: hand its update_refs list to __free_recorded_refs(), detach
+ * it from the list and/or the sctx->pending_dir_moves rbtree if it is
+ * still linked there, then free the structure itself.
+ */
+static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
+{
+       __free_recorded_refs(&m->update_refs);
+
+       /* Only a node that is actually linked in the rbtree is erased. */
+       if (!RB_EMPTY_NODE(&m->node))
+               rb_erase(&m->node, &sctx->pending_dir_moves);
+       if (!list_empty(&m->list))
+               list_del(&m->list);
+
+       kfree(m);
+}
+
+/*
+ * Append @moves to the tail of @stack, followed by every entry that was
+ * chained on moves->list, keeping their order.
+ */
+static void tail_append_pending_moves(struct pending_dir_move *moves,
+                                     struct list_head *stack)
+{
+       LIST_HEAD(chained);
+
+       /*
+        * Detach whatever is chained behind @moves first; when nothing is
+        * chained both splice calls leave their lists untouched.
+        */
+       list_splice_init(&moves->list, &chained);
+       list_add_tail(&moves->list, stack);
+       list_splice_tail(&chained, stack);
+}
+
+/*
+ * Apply the pending moves keyed by the current inode (sctx->cur_ino) and
+ * then, for every move applied, the pending moves keyed by the inode that
+ * was just moved.
+ *
+ * Work items are kept on the local 'stack' list, which is consumed from
+ * the head and extended at the tail.
+ *
+ * Returns 0 on success. On the first non-zero result of apply_dir_move()
+ * that value is returned and every entry still queued is freed.
+ */
+static int apply_children_dir_moves(struct send_ctx *sctx)
+{
+       struct pending_dir_move *pm;
+       struct list_head stack;
+       u64 parent_ino = sctx->cur_ino;
+       int ret = 0;
+
+       pm = get_pending_dir_moves(sctx, parent_ino);
+       if (!pm)
+               return 0;
+
+       INIT_LIST_HEAD(&stack);
+       tail_append_pending_moves(pm, &stack);
+
+       while (!list_empty(&stack)) {
+               pm = list_first_entry(&stack, struct pending_dir_move, list);
+               /* Save the moved inode number; pm is freed just below. */
+               parent_ino = pm->ino;
+               ret = apply_dir_move(sctx, pm);
+               free_pending_move(sctx, pm);
+               if (ret)
+                       goto out;
+               /* Moves keyed by the inode just moved are queued next. */
+               pm = get_pending_dir_moves(sctx, parent_ino);
+               if (pm)
+                       tail_append_pending_moves(pm, &stack);
+       }
+       return 0;
+
+out:
+       while (!list_empty(&stack)) {
+               pm = list_first_entry(&stack, struct pending_dir_move, list);
+               free_pending_move(sctx, pm);
+       }
+       return ret;
+}
+
+/*
+ * Decide whether the rename/move of the current inode must be postponed
+ * until the directory named by @parent_ref (parent_ref->dir) has been
+ * processed.
+ *
+ * Returns 1 when the caller should wait: the directory has a higher inode
+ * number than sctx->cur_ino and is either already waiting for a move or
+ * has a different parent or name in sctx->send_root than in
+ * sctx->parent_root. Returns 0 when no wait is needed, or a negative
+ * errno on failure.
+ */
+static int wait_for_parent_move(struct send_ctx *sctx,
+                               struct recorded_ref *parent_ref)
+{
+       int ret;
+       u64 ino = parent_ref->dir;
+       u64 parent_ino_before, parent_ino_after;
+       u64 new_gen, old_gen;
+       struct fs_path *path_before = NULL;
+       struct fs_path *path_after = NULL;
+       int len1, len2;
+
+       /* Only directories with a higher inode number can make us wait. */
+       if (parent_ref->dir <= sctx->cur_ino)
+               return 0;
+
+       if (is_waiting_for_move(sctx, ino))
+               return 1;
+
+       /* Not found in parent_root: nothing to wait for. */
+       ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
+                            NULL, NULL, NULL, NULL);
+       if (ret == -ENOENT)
+               return 0;
+       else if (ret < 0)
+               return ret;
+
+       ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen,
+                            NULL, NULL, NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       /* Generations differ between the two roots: do not wait. */
+       if (new_gen != old_gen)
+               return 0;
+
+       path_before = fs_path_alloc();
+       if (!path_before)
+               return -ENOMEM;
+
+       ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+                           NULL, path_before);
+       if (ret == -ENOENT) {
+               ret = 0;
+               goto out;
+       } else if (ret < 0) {
+               goto out;
+       }
+
+       path_after = fs_path_alloc();
+       if (!path_after) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+                           NULL, path_after);
+       if (ret == -ENOENT) {
+               ret = 0;
+               goto out;
+       } else if (ret < 0) {
+               goto out;
+       }
+
+       /* A different parent or a different name means it was moved/renamed. */
+       len1 = fs_path_len(path_before);
+       len2 = fs_path_len(path_after);
+       if (parent_ino_before != parent_ino_after || len1 != len2 ||
+            memcmp(path_before->start, path_after->start, len1)) {
+               ret = 1;
+               goto out;
+       }
+       ret = 0;
+
+out:
+       fs_path_free(path_before);
+       fs_path_free(path_after);
+
+       return ret;
+}
+
 /*
  * This does all the move/link/unlink/rmdir magic.
  */
-static int process_recorded_refs(struct send_ctx *sctx)
+static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 {
        int ret = 0;
        struct recorded_ref *cur;
@@ -2650,6 +3261,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
        u64 ow_gen;
        int did_overwrite = 0;
        int is_orphan = 0;
+       u64 last_dir_ino_rm = 0;
 
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
@@ -2788,11 +3400,20 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                 * dirs, we always have one new and one deleted
                                 * ref. The deleted ref is ignored later.
                                 */
-                               ret = send_rename(sctx, valid_path,
-                                               cur->full_path);
+                               ret = wait_for_parent_move(sctx, cur);
                                if (ret < 0)
                                        goto out;
-                               ret = fs_path_copy(valid_path, cur->full_path);
+                               if (ret) {
+                                       ret = add_pending_dir_move(sctx,
+                                                                  cur->dir);
+                                       *pending_move = 1;
+                               } else {
+                                       ret = send_rename(sctx, valid_path,
+                                                         cur->full_path);
+                                       if (!ret)
+                                               ret = fs_path_copy(valid_path,
+                                                              cur->full_path);
+                               }
                                if (ret < 0)
                                        goto out;
                        } else {
@@ -2814,7 +3435,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                 * later, we do this check again and rmdir it then if possible.
                 * See the use of check_dirs for more details.
                 */
-               ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
+               ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+                               sctx->cur_ino);
                if (ret < 0)
                        goto out;
                if (ret) {
@@ -2905,8 +3527,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                        ret = send_utimes(sctx, cur->dir, cur->dir_gen);
                        if (ret < 0)
                                goto out;
-               } else if (ret == inode_state_did_delete) {
-                       ret = can_rmdir(sctx, cur->dir, sctx->cur_ino);
+               } else if (ret == inode_state_did_delete &&
+                          cur->dir != last_dir_ino_rm) {
+                       ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+                                       sctx->cur_ino);
                        if (ret < 0)
                                goto out;
                        if (ret) {
@@ -2917,6 +3541,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                ret = send_rmdir(sctx, valid_path);
                                if (ret < 0)
                                        goto out;
+                               last_dir_ino_rm = cur->dir;
                        }
                }
        }
@@ -3161,6 +3786,7 @@ static int process_all_refs(struct send_ctx *sctx,
        struct extent_buffer *eb;
        int slot;
        iterate_inode_ref_t cb;
+       int pending_move = 0;
 
        path = alloc_path_for_send();
        if (!path)
@@ -3173,21 +3799,31 @@ static int process_all_refs(struct send_ctx *sctx,
                root = sctx->parent_root;
                cb = __record_deleted_ref;
        } else {
-               BUG();
+               btrfs_err(sctx->send_root->fs_info,
+                               "Wrong command %d in process_all_refs", cmd);
+               ret = -EINVAL;
+               goto out;
        }
 
        key.objectid = sctx->cmp_key->objectid;
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = 0;
-       while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (ret)
-                       break;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
+       while (1) {
                eb = path->nodes[0];
                slot = path->slots[0];
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
                btrfs_item_key_to_cpu(eb, &found_key, slot);
 
                if (found_key.objectid != key.objectid ||
@@ -3196,15 +3832,16 @@ static int process_all_refs(struct send_ctx *sctx,
                        break;
 
                ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
-               btrfs_release_path(path);
                if (ret < 0)
                        goto out;
 
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
        btrfs_release_path(path);
 
-       ret = process_recorded_refs(sctx);
+       ret = process_recorded_refs(sctx, &pending_move);
+       /* Only applicable to an incremental send. */
+       ASSERT(pending_move == 0);
 
 out:
        btrfs_free_path(path);
@@ -3479,19 +4116,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
        key.objectid = sctx->cmp_key->objectid;
        key.type = BTRFS_XATTR_ITEM_KEY;
        key.offset = 0;
-       while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (ret) {
-                       ret = 0;
-                       goto out;
-               }
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
+       while (1) {
                eb = path->nodes[0];
                slot = path->slots[0];
-               btrfs_item_key_to_cpu(eb, &found_key, slot);
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       continue;
+               }
 
+               btrfs_item_key_to_cpu(eb, &found_key, slot);
                if (found_key.objectid != key.objectid ||
                    found_key.type != key.type) {
                        ret = 0;
@@ -3503,8 +4146,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
                if (ret < 0)
                        goto out;
 
-               btrfs_release_path(path);
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
 
 out:
@@ -3761,12 +4403,14 @@ static int send_write_or_clone(struct send_ctx *sctx,
        u64 len;
        u32 l;
        u8 type;
+       u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
 
        ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
                        struct btrfs_file_extent_item);
        type = btrfs_file_extent_type(path->nodes[0], ei);
        if (type == BTRFS_FILE_EXTENT_INLINE) {
-               len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+               len = btrfs_file_extent_inline_len(path->nodes[0],
+                                                  path->slots[0], ei);
                /*
                 * it is possible the inline item won't cover the whole page,
                 * but there may be items after this page.  Make
@@ -3784,7 +4428,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
                goto out;
        }
 
-       if (clone_root) {
+       if (clone_root && IS_ALIGNED(offset + len, bs)) {
                ret = send_clone(sctx, offset, len, clone_root);
        } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
                ret = send_update_extent(sctx, offset, len);
@@ -4007,7 +4651,8 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
                            struct btrfs_file_extent_item);
        type = btrfs_file_extent_type(path->nodes[0], fi);
        if (type == BTRFS_FILE_EXTENT_INLINE) {
-               u64 size = btrfs_file_extent_inline_len(path->nodes[0], fi);
+               u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+                                                       path->slots[0], fi);
                extent_end = ALIGN(key.offset + size,
                                   sctx->send_root->sectorsize);
        } else {
@@ -4041,13 +4686,29 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
                            struct btrfs_file_extent_item);
        type = btrfs_file_extent_type(path->nodes[0], fi);
        if (type == BTRFS_FILE_EXTENT_INLINE) {
-               u64 size = btrfs_file_extent_inline_len(path->nodes[0], fi);
+               u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+                                                       path->slots[0], fi);
                extent_end = ALIGN(key->offset + size,
                                   sctx->send_root->sectorsize);
        } else {
                extent_end = key->offset +
                        btrfs_file_extent_num_bytes(path->nodes[0], fi);
        }
+
+       if (path->slots[0] == 0 &&
+           sctx->cur_inode_last_extent < key->offset) {
+               /*
+                * We might have skipped entire leafs that contained only
+                * file extent items for our current inode. These leafs have
+                * a generation number smaller (older) than the one in the
+                * current leaf and the leaf our last extent came from, and
+                * are located between these 2 leafs.
+                */
+               ret = get_last_extent(sctx, key->offset - 1);
+               if (ret)
+                       return ret;
+       }
+
        if (sctx->cur_inode_last_extent < key->offset)
                ret = send_hole(sctx, key->offset);
        sctx->cur_inode_last_extent = extent_end;
@@ -4132,17 +4793,25 @@ static int process_all_extents(struct send_ctx *sctx)
        key.objectid = sctx->cmp_key->objectid;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = 0;
-       while (1) {
-               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0)
-                       goto out;
-               if (ret) {
-                       ret = 0;
-                       goto out;
-               }
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
 
+       while (1) {
                eb = path->nodes[0];
                slot = path->slots[0];
+
+               if (slot >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       continue;
+               }
+
                btrfs_item_key_to_cpu(eb, &found_key, slot);
 
                if (found_key.objectid != key.objectid ||
@@ -4155,8 +4824,7 @@ static int process_all_extents(struct send_ctx *sctx)
                if (ret < 0)
                        goto out;
 
-               btrfs_release_path(path);
-               key.offset = found_key.offset + 1;
+               path->slots[0]++;
        }
 
 out:
@@ -4164,7 +4832,9 @@ out:
        return ret;
 }
 
-static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+                                          int *pending_move,
+                                          int *refs_processed)
 {
        int ret = 0;
 
@@ -4176,17 +4846,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
        if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
                goto out;
 
-       ret = process_recorded_refs(sctx);
+       ret = process_recorded_refs(sctx, pending_move);
        if (ret < 0)
                goto out;
 
-       /*
-        * We have processed the refs and thus need to advance send_progress.
-        * Now, calls to get_cur_xxx will take the updated refs of the current
-        * inode into account.
-        */
-       sctx->send_progress = sctx->cur_ino + 1;
-
+       *refs_processed = 1;
 out:
        return ret;
 }
@@ -4202,11 +4866,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
        u64 right_gid;
        int need_chmod = 0;
        int need_chown = 0;
+       int pending_move = 0;
+       int refs_processed = 0;
 
-       ret = process_recorded_refs_if_needed(sctx, at_end);
+       ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
+                                             &refs_processed);
        if (ret < 0)
                goto out;
 
+       /*
+        * We have processed the refs and thus need to advance send_progress.
+        * Now, calls to get_cur_xxx will take the updated refs of the current
+        * inode into account.
+        *
+        * On the other hand, if our current inode is a directory and couldn't
+        * be moved/renamed because its parent was renamed/moved too and it has
+        * a higher inode number, we can only move/rename our current inode
+        * after we moved/renamed its parent. Therefore in this case operate on
+        * the old path (pre move/rename) of our current inode, and the
+        * move/rename will be performed later.
+        */
+       if (refs_processed && !pending_move)
+               sctx->send_progress = sctx->cur_ino + 1;
+
        if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
                goto out;
        if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
@@ -4268,9 +4950,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
        }
 
        /*
-        * Need to send that every time, no matter if it actually changed
-        * between the two trees as we have done changes to the inode before.
+        * If other directory inodes depended on our current directory
+        * inode's move/rename, now do their move/rename operations.
+        */
+       if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
+               ret = apply_children_dir_moves(sctx);
+               if (ret)
+                       goto out;
+       }
+
+       /*
+        * Need to send that every time, no matter if it actually
+        * changed between the two trees as we have done changes to
+        * the inode before.
         */
+       sctx->send_progress = sctx->cur_ino + 1;
        ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
        if (ret < 0)
                goto out;
@@ -4340,6 +5034,8 @@ static int changed_inode(struct send_ctx *sctx,
                                sctx->left_path->nodes[0], left_ii);
                sctx->cur_inode_mode = btrfs_inode_mode(
                                sctx->left_path->nodes[0], left_ii);
+               sctx->cur_inode_rdev = btrfs_inode_rdev(
+                               sctx->left_path->nodes[0], left_ii);
                if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
                        ret = send_create_inode_if_needed(sctx);
        } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4384,6 +5080,8 @@ static int changed_inode(struct send_ctx *sctx,
                                        sctx->left_path->nodes[0], left_ii);
                        sctx->cur_inode_mode = btrfs_inode_mode(
                                        sctx->left_path->nodes[0], left_ii);
+                       sctx->cur_inode_rdev = btrfs_inode_rdev(
+                                       sctx->left_path->nodes[0], left_ii);
                        ret = send_create_inode_if_needed(sctx);
                        if (ret < 0)
                                goto out;
@@ -4618,6 +5316,7 @@ out:
 static int full_send_tree(struct send_ctx *sctx)
 {
        int ret;
+       struct btrfs_trans_handle *trans = NULL;
        struct btrfs_root *send_root = sctx->send_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
@@ -4639,6 +5338,19 @@ static int full_send_tree(struct send_ctx *sctx)
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
 
+join_trans:
+       /*
+        * We need to make sure the transaction does not get committed
+        * while we do anything on commit roots. Join a transaction to prevent
+        * this.
+        */
+       trans = btrfs_join_transaction(send_root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               trans = NULL;
+               goto out;
+       }
+
        /*
         * Make sure the tree has not changed after re-joining. We detect this
         * by comparing start_ctransid and ctransid. They should always match.
@@ -4648,7 +5360,7 @@ static int full_send_tree(struct send_ctx *sctx)
        spin_unlock(&send_root->root_item_lock);
 
        if (ctransid != start_ctransid) {
-               WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
+               WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
                                     "send was modified in between. This is "
                                     "probably a bug.\n");
                ret = -EIO;
@@ -4662,6 +5374,19 @@ static int full_send_tree(struct send_ctx *sctx)
                goto out_finish;
 
        while (1) {
+               /*
+                * When someone wants to commit while we iterate, end the
+                * joined transaction and rejoin.
+                */
+               if (btrfs_should_end_transaction(trans, send_root)) {
+                       ret = btrfs_end_transaction(trans, send_root);
+                       trans = NULL;
+                       if (ret < 0)
+                               goto out;
+                       btrfs_release_path(path);
+                       goto join_trans;
+               }
+
                eb = path->nodes[0];
                slot = path->slots[0];
                btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -4689,6 +5414,12 @@ out_finish:
 
 out:
        btrfs_free_path(path);
+       if (trans) {
+               if (!ret)
+                       ret = btrfs_end_transaction(trans, send_root);
+               else
+                       btrfs_end_transaction(trans, send_root);
+       }
        return ret;
 }
 
@@ -4725,6 +5456,21 @@ out:
        return ret;
 }
 
+/*
+ * Decrement @root's send_in_progress counter while holding
+ * root->root_item_lock. If the counter drops below zero the imbalance is
+ * reported through btrfs_err(); the counter is left as is.
+ */
+static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
+{
+       spin_lock(&root->root_item_lock);
+       root->send_in_progress--;
+       /*
+        * Not much left to do, we don't know why it's unbalanced and
+        * can't blindly reset it to 0.
+        */
+       if (root->send_in_progress < 0)
+               btrfs_err(root->fs_info,
+                       "send_in_progress unbalanced %d root %llu",
+                       root->send_in_progress, root->root_key.objectid);
+       spin_unlock(&root->root_item_lock);
+}
+
 long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 {
        int ret = 0;
@@ -4737,6 +5483,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        u32 i;
        u64 *clone_sources_tmp = NULL;
        int clone_sources_to_rollback = 0;
+       int sort_clone_roots = 0;
+       int index;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -4758,35 +5506,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
         */
        WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
 
-       /*
-        * If we just created this root we need to make sure that the orphan
-        * cleanup has been done and committed since we search the commit root,
-        * so check its commit root transid with our otransid and if they match
-        * commit the transaction to make sure everything is updated.
-        */
-       down_read(&send_root->fs_info->extent_commit_sem);
-       if (btrfs_header_generation(send_root->commit_root) ==
-           btrfs_root_otransid(&send_root->root_item)) {
-               struct btrfs_trans_handle *trans;
-
-               up_read(&send_root->fs_info->extent_commit_sem);
-
-               trans = btrfs_attach_transaction_barrier(send_root);
-               if (IS_ERR(trans)) {
-                       if (PTR_ERR(trans) != -ENOENT) {
-                               ret = PTR_ERR(trans);
-                               goto out;
-                       }
-                       /* ENOENT means theres no transaction */
-               } else {
-                       ret = btrfs_commit_transaction(trans, send_root);
-                       if (ret)
-                               goto out;
-               }
-       } else {
-               up_read(&send_root->fs_info->extent_commit_sem);
-       }
-
        /*
         * Userspace tools do the checks and warn the user if it's
         * not RO.
@@ -4850,6 +5569,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                goto out;
        }
 
+       sctx->pending_dir_moves = RB_ROOT;
+       sctx->waiting_dir_moves = RB_ROOT;
+       sctx->orphan_dirs = RB_ROOT;
+
        sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
                        (arg->clone_sources_count + 1));
        if (!sctx->clone_roots) {
@@ -4877,8 +5600,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                        key.objectid = clone_sources_tmp[i];
                        key.type = BTRFS_ROOT_ITEM_KEY;
                        key.offset = (u64)-1;
+
+                       index = srcu_read_lock(&fs_info->subvol_srcu);
+
                        clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
                        if (IS_ERR(clone_root)) {
+                               srcu_read_unlock(&fs_info->subvol_srcu, index);
                                ret = PTR_ERR(clone_root);
                                goto out;
                        }
@@ -4887,10 +5614,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                        clone_root->send_in_progress++;
                        if (!btrfs_root_readonly(clone_root)) {
                                spin_unlock(&clone_root->root_item_lock);
+                               srcu_read_unlock(&fs_info->subvol_srcu, index);
                                ret = -EPERM;
                                goto out;
                        }
                        spin_unlock(&clone_root->root_item_lock);
+                       srcu_read_unlock(&fs_info->subvol_srcu, index);
+
                        sctx->clone_roots[i].root = clone_root;
                }
                vfree(clone_sources_tmp);
@@ -4901,19 +5631,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                key.objectid = arg->parent_root;
                key.type = BTRFS_ROOT_ITEM_KEY;
                key.offset = (u64)-1;
+
+               index = srcu_read_lock(&fs_info->subvol_srcu);
+
                sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
                if (IS_ERR(sctx->parent_root)) {
+                       srcu_read_unlock(&fs_info->subvol_srcu, index);
                        ret = PTR_ERR(sctx->parent_root);
                        goto out;
                }
+
                spin_lock(&sctx->parent_root->root_item_lock);
                sctx->parent_root->send_in_progress++;
                if (!btrfs_root_readonly(sctx->parent_root)) {
                        spin_unlock(&sctx->parent_root->root_item_lock);
+                       srcu_read_unlock(&fs_info->subvol_srcu, index);
                        ret = -EPERM;
                        goto out;
                }
                spin_unlock(&sctx->parent_root->root_item_lock);
+
+               srcu_read_unlock(&fs_info->subvol_srcu, index);
        }
 
        /*
@@ -4927,6 +5665,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        sort(sctx->clone_roots, sctx->clone_roots_cnt,
                        sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
                        NULL);
+       sort_clone_roots = 1;
 
        ret = send_subvol(sctx);
        if (ret < 0)
@@ -4942,24 +5681,57 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        }
 
 out:
-       for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
-               struct btrfs_root *r = sctx->clone_roots[i].root;
+       WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
+       while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
+               struct rb_node *n;
+               struct pending_dir_move *pm;
+
+               n = rb_first(&sctx->pending_dir_moves);
+               pm = rb_entry(n, struct pending_dir_move, node);
+               while (!list_empty(&pm->list)) {
+                       struct pending_dir_move *pm2;
+
+                       pm2 = list_first_entry(&pm->list,
+                                              struct pending_dir_move, list);
+                       free_pending_move(sctx, pm2);
+               }
+               free_pending_move(sctx, pm);
+       }
 
-               spin_lock(&r->root_item_lock);
-               r->send_in_progress--;
-               spin_unlock(&r->root_item_lock);
+       WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
+       while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
+               struct rb_node *n;
+               struct waiting_dir_move *dm;
+
+               n = rb_first(&sctx->waiting_dir_moves);
+               dm = rb_entry(n, struct waiting_dir_move, node);
+               rb_erase(&dm->node, &sctx->waiting_dir_moves);
+               kfree(dm);
        }
-       if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
-               struct btrfs_root *r = sctx->parent_root;
 
-               spin_lock(&r->root_item_lock);
-               r->send_in_progress--;
-               spin_unlock(&r->root_item_lock);
+       WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+       while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+               struct rb_node *n;
+               struct orphan_dir_info *odi;
+
+               n = rb_first(&sctx->orphan_dirs);
+               odi = rb_entry(n, struct orphan_dir_info, node);
+               free_orphan_dir_info(sctx, odi);
        }
 
-       spin_lock(&send_root->root_item_lock);
-       send_root->send_in_progress--;
-       spin_unlock(&send_root->root_item_lock);
+       if (sort_clone_roots) {
+               for (i = 0; i < sctx->clone_roots_cnt; i++)
+                       btrfs_root_dec_send_in_progress(
+                                       sctx->clone_roots[i].root);
+       } else {
+               for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+                       btrfs_root_dec_send_in_progress(
+                                       sctx->clone_roots[i].root);
+
+               btrfs_root_dec_send_in_progress(send_root);
+       }
+       if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+               btrfs_root_dec_send_in_progress(sctx->parent_root);
 
        kfree(arg);
        vfree(clone_sources_tmp);