btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
[cascardo/linux.git] / fs/btrfs/inode.c
index 03708ef..b1106d0 100644
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 
                key.objectid = btrfs_ino(inode);
                key.offset = start;
-               btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+               key.type = BTRFS_EXTENT_DATA_KEY;
 
                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
                data_len = compressed_size;
 
        if (start > 0 ||
-           actual_end >= PAGE_CACHE_SIZE ||
-           data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+           actual_end > PAGE_CACHE_SIZE ||
+           data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
            (!compressed_size &&
            (actual_end & (root->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
        return 0;
 }
 
+static inline int inode_need_compress(struct inode *inode)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       /* force compress */
+       if (btrfs_test_opt(root, FORCE_COMPRESS))
+               return 1;
+       /* bad compression ratios */
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+               return 0;
+       if (btrfs_test_opt(root, COMPRESS) ||
+           BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+           BTRFS_I(inode)->force_compress)
+               return 1;
+       return 0;
+}
+
 /*
  * we create compressed extents in two phases.  The first
  * phase compresses a range of pages that have already been
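
The new inode_need_compress() helper centralizes a decision that used to be open-coded in compress_file_range(): the mount-wide force-compress option is tested first, so it overrides the per-inode NOCOMPRESS flag that gets set after bad compression ratios. Below is a minimal userspace sketch of that priority order; the struct fields are hypothetical stand-ins for the real mount options and inode flags, not kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the mount options and inode flags. */
struct fake_inode {
        bool mount_force_compress;      /* -o compress-force                 */
        bool mount_compress;            /* -o compress                       */
        bool inode_nocompress;          /* set after bad compression ratios  */
        bool inode_compress;            /* chattr +c                         */
        bool force_compress;            /* defrag with compression running   */
};

/* Same priority order as inode_need_compress(): force > nocompress > the rest. */
static bool need_compress(const struct fake_inode *i)
{
        if (i->mount_force_compress)
                return true;
        if (i->inode_nocompress)
                return false;
        return i->mount_compress || i->inode_compress || i->force_compress;
}

int main(void)
{
        struct fake_inode bad_ratio = { .mount_compress = true,
                                        .inode_nocompress = true };
        struct fake_inode forced = bad_ratio;

        forced.mount_force_compress = true;
        printf("compress, bad ratio:       %d\n", need_compress(&bad_ratio)); /* 0 */
        printf("compress-force, bad ratio: %d\n", need_compress(&forced));    /* 1 */
        return 0;
}
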
@@ -444,10 +461,7 @@ again:
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
-       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-           (btrfs_test_opt(root, COMPRESS) ||
-            (BTRFS_I(inode)->force_compress) ||
-            (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
+       if (inode_need_compress(inode)) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
                if (!pages) {
@@ -778,8 +792,12 @@ retry:
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
-               if (ret)
+               if (ret) {
+                       btrfs_drop_extent_cache(inode, async_extent->start,
+                                               async_extent->start +
+                                               async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
+               }
 
                /*
                 * clear dirty, set writeback and unlock the pages.
@@ -971,14 +989,14 @@ static noinline int cow_file_range(struct inode *inode,
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
-                       goto out_reserve;
+                       goto out_drop_extent_cache;
 
                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        if (ret)
-                               goto out_reserve;
+                               goto out_drop_extent_cache;
                }
 
                if (disk_num_bytes < cur_alloc_size)
@@ -1006,6 +1024,8 @@ static noinline int cow_file_range(struct inode *inode,
 out:
        return ret;
 
+out_drop_extent_cache:
+       btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
 out_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
@@ -1088,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                async_cow->locked_page = locked_page;
                async_cow->start = start;
 
-               if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+               if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+                   !btrfs_test_opt(root, FORCE_COMPRESS))
                        cur_end = end;
                else
                        cur_end = min(end, start + 512 * 1024 - 1);
@@ -1096,8 +1117,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);
 
-               btrfs_init_work(&async_cow->work, async_cow_start,
-                               async_cow_submit, async_cow_free);
+               btrfs_init_work(&async_cow->work,
+                               btrfs_delalloc_helper,
+                               async_cow_start, async_cow_submit,
+                               async_cow_free);
 
                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
@@ -1437,6 +1460,26 @@ error:
        return ret;
 }
 
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+           !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+               return 0;
+
+       /*
+        * @defrag_bytes is a hint value, no spinlock held here,
+        * if it is not zero, it means the file is being defragged.
+        * Force cow if given extent needs to be defragged.
+        */
+       if (BTRFS_I(inode)->defrag_bytes &&
+           test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+                          EXTENT_DEFRAG, 0, NULL))
+               return 1;
+
+       return 0;
+}
+
 /*
  * extent_io.c call back to do delayed allocation processing
  */
@@ -1445,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                              unsigned long *nr_written)
 {
        int ret;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int force_cow = need_force_cow(inode, start, end);
 
-       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 1, nr_written);
-       } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+       } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 0, nr_written);
-       } else if (!btrfs_test_opt(root, COMPRESS) &&
-                  !(BTRFS_I(inode)->force_compress) &&
-                  !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+       } else if (!inode_need_compress(inode)) {
                ret = cow_file_range(inode, locked_page, start, end,
                                      page_started, nr_written, 1);
        } else {
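
need_force_cow() only comes into play for NODATACOW or PREALLOC inodes: defrag_bytes is read as a lockless hint, and the EXTENT_DEFRAG bit on the range is what actually forces the COW path, which is what lets autodefrag rewrite ranges on nodatacow files. A small userspace sketch of the resulting dispatch order in run_delalloc_range(), with simplified stand-in flags (the real code also passes a force/prealloc argument to distinguish the two nocow modes):

#include <stdio.h>

/* Simplified stand-ins for the flags run_delalloc_range() inspects. */
struct delalloc_ctx {
        int nodatacow;          /* BTRFS_INODE_NODATACOW           */
        int prealloc;           /* BTRFS_INODE_PREALLOC            */
        int range_is_defrag;    /* EXTENT_DEFRAG set on this range */
        int need_compress;      /* inode_need_compress() said yes  */
};

enum cow_path { PATH_NOCOW, PATH_NOCOW_PREALLOC, PATH_PLAIN_COW, PATH_COMPRESSED_COW };

/* Same precedence as run_delalloc_range(): a defragged range forces COW. */
static enum cow_path pick_path(const struct delalloc_ctx *c)
{
        int force_cow = (c->nodatacow || c->prealloc) && c->range_is_defrag;

        if (c->nodatacow && !force_cow)
                return PATH_NOCOW;
        if (c->prealloc && !force_cow)
                return PATH_NOCOW_PREALLOC;
        if (!c->need_compress)
                return PATH_PLAIN_COW;
        return PATH_COMPRESSED_COW;
}

int main(void)
{
        struct delalloc_ctx nocow = { .nodatacow = 1 };
        struct delalloc_ctx defragged = { .nodatacow = 1, .range_is_defrag = 1 };

        printf("nodatacow:          %d\n", pick_path(&nocow));     /* PATH_NOCOW (0)     */
        printf("nodatacow + defrag: %d\n", pick_path(&defragged)); /* PATH_PLAIN_COW (2) */
        return 0;
}
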
@@ -1547,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                               struct extent_state *state, unsigned long *bits)
 {
 
+       if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+               WARN_ON(1);
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
@@ -1569,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                                     root->fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
+               if (*bits & EXTENT_DEFRAG)
+                       BTRFS_I(inode)->defrag_bytes += len;
                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
                                         &BTRFS_I(inode)->runtime_flags))
                        btrfs_add_delalloc_inodes(root, inode);
@@ -1583,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                                 struct extent_state *state,
                                 unsigned long *bits)
 {
+       u64 len = state->end + 1 - state->start;
+
+       spin_lock(&BTRFS_I(inode)->lock);
+       if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+               BTRFS_I(inode)->defrag_bytes -= len;
+       spin_unlock(&BTRFS_I(inode)->lock);
+
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
@@ -1590,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
         */
        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
-               u64 len = state->end + 1 - state->start;
                bool do_list = !btrfs_is_free_space_inode(inode);
 
                if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -1881,7 +1932,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 
        SetPageChecked(page);
        page_cache_get(page);
-       btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+       btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+                       btrfs_writepage_fixup_worker, NULL, NULL);
        fixup->page = page;
        btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
        return -EBUSY;
@@ -2651,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                goto out;
        }
 
+       btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+                                    ordered_extent->file_offset +
+                                    ordered_extent->len - 1);
+
        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
                truncated = true;
                logical_len = ordered_extent->truncated_len;
@@ -2822,7 +2878,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
        struct inode *inode = page->mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_extent *ordered_extent = NULL;
-       struct btrfs_workqueue *workers;
+       struct btrfs_workqueue *wq;
+       btrfs_work_func_t func;
 
        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
 
@@ -2831,17 +2888,55 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                            end - start + 1, uptodate))
                return 0;
 
-       btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+       if (btrfs_is_free_space_inode(inode)) {
+               wq = root->fs_info->endio_freespace_worker;
+               func = btrfs_freespace_write_helper;
+       } else {
+               wq = root->fs_info->endio_write_workers;
+               func = btrfs_endio_write_helper;
+       }
 
-       if (btrfs_is_free_space_inode(inode))
-               workers = root->fs_info->endio_freespace_worker;
-       else
-               workers = root->fs_info->endio_write_workers;
-       btrfs_queue_work(workers, &ordered_extent->work);
+       btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+                       NULL);
+       btrfs_queue_work(wq, &ordered_extent->work);
 
        return 0;
 }
 
+static int __readpage_endio_check(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio,
+                                 int icsum, struct page *page,
+                                 int pgoff, u64 start, size_t len)
+{
+       char *kaddr;
+       u32 csum_expected;
+       u32 csum = ~(u32)0;
+       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
+
+       csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+       kaddr = kmap_atomic(page);
+       csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
+       btrfs_csum_final(csum, (char *)&csum);
+       if (csum != csum_expected)
+               goto zeroit;
+
+       kunmap_atomic(kaddr);
+       return 0;
+zeroit:
+       if (__ratelimit(&_rs))
+               btrfs_info(BTRFS_I(inode)->root->fs_info,
+                          "csum failed ino %llu off %llu csum %u expected csum %u",
+                          btrfs_ino(inode), start, csum, csum_expected);
+       memset(kaddr + pgoff, 1, len);
+       flush_dcache_page(page);
+       kunmap_atomic(kaddr);
+       if (csum_expected == 0)
+               return 0;
+       return -EIO;
+}
+
 /*
  * when reads are done, we need to check csums to verify the data is correct
  * if there's a match, we allow the bio to finish.  If not, the code in
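
Factoring the checksum verification out of btrfs_readpage_end_io_hook() into __readpage_endio_check() lets the new direct-IO retry paths reuse it. The flow is: checksum the block, compare against the expected value taken from io_bio->csum, and on a mismatch poison the range (memset to 1) so stale data is never exposed, while an expected checksum of zero is treated as "no checksum" and still succeeds. A minimal userspace sketch of that flow, with a toy hash standing in for the CRC32C the kernel computes via btrfs_csum_data():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in for btrfs_csum_data()/btrfs_csum_final(); the kernel uses CRC32C. */
static uint32_t toy_csum(const uint8_t *buf, size_t len)
{
        uint32_t sum = ~0u;
        size_t i;

        for (i = 0; i < len; i++)
                sum = (sum << 5) + sum + buf[i];
        return sum;
}

/*
 * Mirrors the __readpage_endio_check() flow: verify one block and, on a
 * mismatch, poison the buffer so stale contents never reach the caller.
 */
static int check_block(uint8_t *block, size_t len, uint32_t expected)
{
        uint32_t csum = toy_csum(block, len);

        if (csum == expected)
                return 0;

        fprintf(stderr, "csum failed: got %u expected %u\n", csum, expected);
        memset(block, 1, len);
        return expected == 0 ? 0 : -5; /* -EIO, unless there was no csum at all */
}

int main(void)
{
        uint8_t block[4096] = "some data";
        uint32_t good = toy_csum(block, sizeof(block));

        printf("good: %d\n", check_block(block, sizeof(block), good));     /* 0  */
        printf("bad:  %d\n", check_block(block, sizeof(block), good + 1)); /* -5 */
        return 0;
}
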
@@ -2854,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
        size_t offset = start - page_offset(page);
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       char *kaddr;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       u32 csum_expected;
-       u32 csum = ~(u32)0;
-       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                     DEFAULT_RATELIMIT_BURST);
 
        if (PageChecked(page)) {
                ClearPageChecked(page);
-               goto good;
+               return 0;
        }
 
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-               goto good;
+               return 0;
 
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2877,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
        }
 
        phy_offset >>= inode->i_sb->s_blocksize_bits;
-       csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
-       kaddr = kmap_atomic(page);
-       csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
-       btrfs_csum_final(csum, (char *)&csum);
-       if (csum != csum_expected)
-               goto zeroit;
-
-       kunmap_atomic(kaddr);
-good:
-       return 0;
-
-zeroit:
-       if (__ratelimit(&_rs))
-               btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                       btrfs_ino(page->mapping->host), start, csum, csum_expected);
-       memset(kaddr + offset, 1, end - start + 1);
-       flush_dcache_page(page);
-       kunmap_atomic(kaddr);
-       if (csum_expected == 0)
-               return 0;
-       return -EIO;
+       return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+                                     start, (size_t)(end - start + 1));
 }
 
 struct delayed_iput {
@@ -3145,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
        path->reada = -1;
 
        key.objectid = BTRFS_ORPHAN_OBJECTID;
-       btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = (u64)-1;
 
        while (1) {
@@ -3172,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                /* make sure the item matches what we want */
                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
                        break;
-               if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+               if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;
 
                /* release the path since we're done with it */
@@ -4071,7 +4141,7 @@ search_again:
                fi = NULL;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-               found_type = btrfs_key_type(&found_key);
+               found_type = found_key.type;
 
                if (found_key.objectid != ino)
                        break;
@@ -4234,7 +4304,8 @@ out:
                        btrfs_abort_transaction(trans, root, ret);
        }
 error:
-       if (last_size != (u64)-1)
+       if (last_size != (u64)-1 &&
+           root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                btrfs_ordered_update_i_size(inode, last_size, NULL);
        btrfs_free_path(path);
        return err;
@@ -4674,6 +4745,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
                remove_extent_mapping(map_tree, em);
                free_extent_map(em);
+               if (need_resched()) {
+                       write_unlock(&map_tree->lock);
+                       cond_resched();
+                       write_lock(&map_tree->lock);
+               }
        }
        write_unlock(&map_tree->lock);
 
@@ -4696,6 +4772,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
                                 &cached_state, GFP_NOFS);
                free_extent_state(state);
 
+               cond_resched();
                spin_lock(&io_tree->lock);
        }
        spin_unlock(&io_tree->lock);
@@ -4726,6 +4803,8 @@ void btrfs_evict_inode(struct inode *inode)
        /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+       btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
        if (root->fs_info->log_root_recovering) {
                BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                                 &BTRFS_I(inode)->runtime_flags));
@@ -5181,6 +5260,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                        iput(inode);
                        inode = ERR_PTR(ret);
                }
+               /*
+                * If orphan cleanup did remove any orphans, it means the tree
+                * was modified and therefore the commit root is not the same as
+                * the current root anymore. This is a problem, because send
+                * uses the commit root and therefore can see inode items that
+                * don't exist in the current root anymore, and for example make
+                * calls to btrfs_iget, which will do tree lookups based on the
+                * current root and not on the commit root. Those lookups will
+                * fail, returning a -ESTALE error, and making send fail with
+                * that error. So make sure a send does not see any orphans we
+                * have just removed, and that it will see the same inodes
+                * regardless of whether a transaction commit happened before
+                * it started (meaning that the commit root will be the same as
+                * the current root) or not.
+                */
+               if (sub_root->node != sub_root->commit_root) {
+                       u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
+
+                       if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
+                               struct extent_buffer *eb;
+
+                               /*
+                                * Assert we can't have races between dentry
+                                * lookup called through the snapshot creation
+                                * ioctl and the VFS.
+                                */
+                               ASSERT(mutex_is_locked(&dir->i_mutex));
+
+                               down_write(&root->fs_info->commit_root_sem);
+                               eb = sub_root->commit_root;
+                               sub_root->commit_root =
+                                       btrfs_root_node(sub_root);
+                               up_write(&root->fs_info->commit_root_sem);
+                               free_extent_buffer(eb);
+                       }
+               }
        }
 
        return inode;
@@ -5274,7 +5389,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                btrfs_get_delayed_items(inode, &ins_list, &del_list);
        }
 
-       btrfs_set_key_type(&key, key_type);
+       key.type = key_type;
        key.offset = ctx->pos;
        key.objectid = btrfs_ino(inode);
 
@@ -5299,7 +5414,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
                if (found_key.objectid != key.objectid)
                        break;
-               if (btrfs_key_type(&found_key) != key_type)
+               if (found_key.type != key_type)
                        break;
                if (found_key.offset < ctx->pos)
                        goto next;
@@ -5511,7 +5626,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
        int ret;
 
        key.objectid = btrfs_ino(inode);
-       btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+       key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = (u64)-1;
 
        path = btrfs_alloc_path();
@@ -5543,7 +5658,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
        if (found_key.objectid != btrfs_ino(inode) ||
-           btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+           found_key.type != BTRFS_DIR_INDEX_KEY) {
                BTRFS_I(inode)->index_cnt = 2;
                goto out;
        }
@@ -5577,6 +5692,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
        return ret;
 }
 
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+       struct btrfs_iget_args args;
+       args.location = &BTRFS_I(inode)->location;
+       args.root = BTRFS_I(inode)->root;
+
+       return insert_inode_locked4(inode,
+                  btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+                  btrfs_find_actor, &args);
+}
+
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct inode *dir,
@@ -5605,6 +5731,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                return ERR_PTR(-ENOMEM);
        }
 
+       /*
+        * O_TMPFILE, set link count to 0, so that after this point,
+        * we fill in an inode item with the correct link count.
+        */
+       if (!name)
+               set_nlink(inode, 0);
+
        /*
         * we have to initialize this early, so we can reclaim the inode
         * number if we fail afterwards in this function.
@@ -5643,7 +5776,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
        key[0].objectid = objectid;
-       btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+       key[0].type = BTRFS_INODE_ITEM_KEY;
        key[0].offset = 0;
 
        sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5656,16 +5789,25 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                 * add more hard links than can fit in the ref item.
                 */
                key[1].objectid = objectid;
-               btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+               key[1].type = BTRFS_INODE_REF_KEY;
                key[1].offset = ref_objectid;
 
                sizes[1] = name_len + sizeof(*ref);
        }
 
+       location = &BTRFS_I(inode)->location;
+       location->objectid = objectid;
+       location->offset = 0;
+       location->type = BTRFS_INODE_ITEM_KEY;
+
+       ret = btrfs_insert_inode_locked(inode);
+       if (ret < 0)
+               goto fail;
+
        path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
        if (ret != 0)
-               goto fail;
+               goto fail_unlock;
 
        inode_init_owner(inode, dir, mode);
        inode_set_bytes(inode, 0);
@@ -5688,11 +5830,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
-       location = &BTRFS_I(inode)->location;
-       location->objectid = objectid;
-       location->offset = 0;
-       btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-
        btrfs_inherit_iflags(inode, dir);
 
        if (S_ISREG(mode)) {
@@ -5703,7 +5840,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                BTRFS_INODE_NODATASUM;
        }
 
-       btrfs_insert_inode_hash(inode);
        inode_tree_add(inode);
 
        trace_btrfs_inode_new(inode);
@@ -5718,6 +5854,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                          btrfs_ino(inode), root->root_key.objectid, ret);
 
        return inode;
+
+fail_unlock:
+       unlock_new_inode(inode);
 fail:
        if (dir && name)
                BTRFS_I(dir)->index_cnt--;
@@ -5751,7 +5890,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
        } else {
                key.objectid = ino;
-               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+               key.type = BTRFS_INODE_ITEM_KEY;
                key.offset = 0;
        }
 
@@ -5852,28 +5991,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
 
-       err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-       if (err) {
-               drop_inode = 1;
-               goto out_unlock;
-       }
-
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
        * if the filesystem supports xattrs by looking at the
        * ops vector.
        */
-
        inode->i_op = &btrfs_special_inode_operations;
-       err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+       init_special_inode(inode, inode->i_mode, rdev);
+
+       err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
-               drop_inode = 1;
-       else {
-               init_special_inode(inode, inode->i_mode, rdev);
+               goto out_unlock_inode;
+
+       err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+       if (err) {
+               goto out_unlock_inode;
+       } else {
                btrfs_update_inode(trans, root, inode);
+               unlock_new_inode(inode);
                d_instantiate(dentry, inode);
        }
+
 out_unlock:
        btrfs_end_transaction(trans, root);
        btrfs_balance_delayed_items(root);
@@ -5883,6 +6022,12 @@ out_unlock:
                iput(inode);
        }
        return err;
+
+out_unlock_inode:
+       drop_inode = 1;
+       unlock_new_inode(inode);
+       goto out_unlock;
+
 }
 
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5917,15 +6062,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
        drop_inode_on_err = 1;
-
-       err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-       if (err)
-               goto out_unlock;
-
-       err = btrfs_update_inode(trans, root, inode);
-       if (err)
-               goto out_unlock;
-
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
@@ -5934,14 +6070,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        */
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
+       inode->i_mapping->a_ops = &btrfs_aops;
+       inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+
+       err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+       if (err)
+               goto out_unlock_inode;
+
+       err = btrfs_update_inode(trans, root, inode);
+       if (err)
+               goto out_unlock_inode;
 
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
-               goto out_unlock;
+               goto out_unlock_inode;
 
-       inode->i_mapping->a_ops = &btrfs_aops;
-       inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+       unlock_new_inode(inode);
        d_instantiate(dentry, inode);
 
 out_unlock:
@@ -5953,6 +6098,11 @@ out_unlock:
        btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        return err;
+
+out_unlock_inode:
+       unlock_new_inode(inode);
+       goto out_unlock;
+
 }
 
 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6060,25 +6210,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        }
 
        drop_on_err = 1;
+       /* these must be set before we unlock the inode */
+       inode->i_op = &btrfs_dir_inode_operations;
+       inode->i_fop = &btrfs_dir_file_operations;
 
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
-               goto out_fail;
-
-       inode->i_op = &btrfs_dir_inode_operations;
-       inode->i_fop = &btrfs_dir_file_operations;
+               goto out_fail_inode;
 
        btrfs_i_size_write(inode, 0);
        err = btrfs_update_inode(trans, root, inode);
        if (err)
-               goto out_fail;
+               goto out_fail_inode;
 
        err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
                             dentry->d_name.len, 0, index);
        if (err)
-               goto out_fail;
+               goto out_fail_inode;
 
        d_instantiate(dentry, inode);
+       /*
+        * mkdir is special.  We're unlocking after we call d_instantiate
+        * to avoid a race with nfsd calling d_instantiate.
+        */
+       unlock_new_inode(inode);
        drop_on_err = 0;
 
 out_fail:
@@ -6088,23 +6243,66 @@ out_fail:
        btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        return err;
+
+out_fail_inode:
+       unlock_new_inode(inode);
+       goto out_fail;
+}
+
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+       struct rb_node *next;
+
+       next = rb_next(&em->rb_node);
+       if (!next)
+               return NULL;
+       return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+       struct rb_node *prev;
+
+       prev = rb_prev(&em->rb_node);
+       if (!prev)
+               return NULL;
+       return container_of(prev, struct extent_map, rb_node);
 }
 
 /* helper for btrfs_get_extent.  Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
  * and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
  */
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                struct extent_map *existing,
                                struct extent_map *em,
-                               u64 map_start, u64 map_len)
+                               u64 map_start)
 {
+       struct extent_map *prev;
+       struct extent_map *next;
+       u64 start;
+       u64 end;
        u64 start_diff;
 
        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-       start_diff = map_start - em->start;
-       em->start = map_start;
-       em->len = map_len;
+
+       if (existing->start > map_start) {
+               next = existing;
+               prev = prev_extent_map(next);
+       } else {
+               prev = existing;
+               next = next_extent_map(prev);
+       }
+
+       start = prev ? extent_map_end(prev) : em->start;
+       start = max_t(u64, start, em->start);
+       end = next ? next->start : extent_map_end(em);
+       end = min_t(u64, end, extent_map_end(em));
+       start_diff = start - em->start;
+       em->start = start;
+       em->len = end - start;
        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
            !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                em->block_start += start_diff;
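
Instead of unconditionally trimming the new map to [map_start, map_start + sectorsize) as the old code did, the reworked merge_extent_mapping() looks up the rb-tree neighbours of the nearest existing map and clamps the new map to the free gap between them, so the largest piece that still fits is inserted. A minimal userspace sketch of that boundary math, using a plain range struct in place of struct extent_map:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct extent_map: covers [start, start + len). */
struct range {
        uint64_t start;
        uint64_t len;
};

static uint64_t range_end(const struct range *r) { return r->start + r->len; }
static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/*
 * Clamp the new map 'em' to the gap between 'prev' and 'next' (either may
 * be NULL), mirroring the boundary math in merge_extent_mapping().
 */
static void fit_between(struct range *em, const struct range *prev,
                        const struct range *next)
{
        uint64_t start = prev ? range_end(prev) : em->start;
        uint64_t end = next ? next->start : range_end(em);

        start = max_u64(start, em->start);
        end = min_u64(end, range_end(em));
        em->len = end - start;
        em->start = start;
}

int main(void)
{
        struct range prev = { 0, 8192 };        /* neighbour ending at 8K    */
        struct range next = { 20480, 4096 };    /* neighbour starting at 20K */
        struct range em = { 4096, 32768 };      /* new map overlapping both  */

        fit_between(&em, &prev, &next);
        printf("fitted: [%llu, %llu)\n", (unsigned long long)em.start,
               (unsigned long long)range_end(&em));     /* [8192, 20480) */
        return 0;
}
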
@@ -6232,7 +6430,7 @@ again:
                              struct btrfs_file_extent_item);
        /* are we inside the extent that was found? */
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-       found_type = btrfs_key_type(&found_key);
+       found_type = found_key.type;
        if (found_key.objectid != objectid ||
            found_type != BTRFS_EXTENT_DATA_KEY) {
                /*
@@ -6275,6 +6473,8 @@ next:
                        goto not_found;
                if (start + len <= found_key.offset)
                        goto not_found;
+               if (start > found_key.offset)
+                       goto next;
                em->start = start;
                em->orig_start = start;
                em->len = found_key.offset - start;
@@ -6379,26 +6579,21 @@ insert:
 
                ret = 0;
 
-               existing = lookup_extent_mapping(em_tree, start, len);
-               if (existing && (existing->start > start ||
-                   existing->start + existing->len <= start)) {
+               existing = search_extent_mapping(em_tree, start, len);
+               /*
+                * existing will always be non-NULL, since there must be
+                * an extent causing the -EEXIST.
+                */
+               if (start >= extent_map_end(existing) ||
+                   start + len <= existing->start) {
+                       /*
+                        * The existing extent map is the one nearest to
+                        * [start, start + len) but does not overlap it, so
+                        * fit the new map into the free space next to it.
+                        */
+                       err = merge_extent_mapping(em_tree, existing,
+                                                  em, start);
                        free_extent_map(existing);
-                       existing = NULL;
-               }
-               if (!existing) {
-                       existing = lookup_extent_mapping(em_tree, em->start,
-                                                        em->len);
-                       if (existing) {
-                               err = merge_extent_mapping(em_tree, existing,
-                                                          em, start,
-                                                          root->sectorsize);
-                               free_extent_map(existing);
-                               if (err) {
-                                       free_extent_map(em);
-                                       em = NULL;
-                               }
-                       } else {
-                               err = -EIO;
+                       if (err) {
                                free_extent_map(em);
                                em = NULL;
                        }
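
With search_extent_mapping() the insert path is guaranteed a non-NULL nearest map, so the old second lookup_extent_mapping() call and its -EIO fallback disappear; the only remaining decision is whether the returned map already intersects the requested range (in which case it is used as-is) or whether the new map should be fitted beside it via merge_extent_mapping().
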
@@ -7010,8 +7205,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                                       block_start, len,
                                                       orig_block_len,
                                                       ram_bytes, type);
-                               if (IS_ERR(em))
+                               if (IS_ERR(em)) {
+                                       ret = PTR_ERR(em);
                                        goto unlock_err;
+                               }
                        }
 
                        ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7086,45 +7283,277 @@ unlock_err:
        return ret;
 }
 
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+                                       int rw, int mirror_num)
 {
-       struct btrfs_dio_private *dip = bio->bi_private;
-       struct bio_vec *bvec;
-       struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct bio *dio_bio;
-       u32 *csums = (u32 *)dip->csum;
+       int ret;
+
+       BUG_ON(rw & REQ_WRITE);
+
+       bio_get(bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                 BTRFS_WQ_ENDIO_DIO_REPAIR);
+       if (ret)
+               goto err;
+
+       ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+       bio_put(bio);
+       return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+                                     struct bio *failed_bio,
+                                     struct io_failure_record *failrec,
+                                     int failed_mirror)
+{
+       int num_copies;
+
+       num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                     failrec->logical, failrec->len);
+       if (num_copies == 1) {
+               /*
+                * we only have a single copy of the data, so don't bother with
+                * all the retry and error correction code that follows. no
+                * matter what the error is, it is very likely to persist.
+                */
+               pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       failrec->failed_mirror = failed_mirror;
+       failrec->this_mirror++;
+       if (failrec->this_mirror == failed_mirror)
+               failrec->this_mirror++;
+
+       if (failrec->this_mirror > num_copies) {
+               pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+                         struct page *page, u64 start, u64 end,
+                         int failed_mirror, bio_end_io_t *repair_endio,
+                         void *repair_arg)
+{
+       struct io_failure_record *failrec;
+       struct bio *bio;
+       int isector;
+       int read_mode;
+       int ret;
+
+       BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+       ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+       if (ret)
+               return ret;
+
+       ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+                                        failed_mirror);
+       if (!ret) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       if (failed_bio->bi_vcnt > 1)
+               read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+       else
+               read_mode = READ_SYNC;
+
+       isector = start - btrfs_io_bio(failed_bio)->logical;
+       isector >>= inode->i_sb->s_blocksize_bits;
+       bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+                                     0, isector, repair_endio, repair_arg);
+       if (!bio) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       btrfs_debug(BTRFS_I(inode)->root->fs_info,
+                   "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+                   read_mode, failrec->this_mirror, failrec->in_validation);
+
+       ret = submit_dio_repair_bio(inode, bio, read_mode,
+                                   failrec->this_mirror);
+       if (ret) {
+               free_io_failure(inode, failrec);
+               bio_put(bio);
+       }
+
+       return ret;
+}
+
+struct btrfs_retry_complete {
+       struct completion done;
+       struct inode *inode;
        u64 start;
+       int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct bio_vec *bvec;
        int i;
 
-       start = dip->logical_offset;
+       if (err)
+               goto end;
+
+       done->uptodate = 1;
+       bio_for_each_segment_all(bvec, bio, i)
+               clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+                                      struct btrfs_io_bio *io_bio)
+{
+       struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
+       u64 start;
+       int i;
+       int ret;
+
+       start = io_bio->logical;
+       done.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio_nocsum, &done);
+               if (ret)
+                       return ret;
+
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+
+               start += bvec->bv_len;
+       }
+
+       return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct bio_vec *bvec;
+       int uptodate;
+       int ret;
+       int i;
+
+       if (err)
+               goto end;
+
+       uptodate = 1;
        bio_for_each_segment_all(bvec, bio, i) {
-               if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                       struct page *page = bvec->bv_page;
-                       char *kaddr;
-                       u32 csum = ~(u32)0;
-                       unsigned long flags;
-
-                       local_irq_save(flags);
-                       kaddr = kmap_atomic(page);
-                       csum = btrfs_csum_data(kaddr + bvec->bv_offset,
-                                              csum, bvec->bv_len);
-                       btrfs_csum_final(csum, (char *)&csum);
-                       kunmap_atomic(kaddr);
-                       local_irq_restore(flags);
-
-                       flush_dcache_page(bvec->bv_page);
-                       if (csum != csums[i]) {
-                               btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                                         btrfs_ino(inode), start, csum,
-                                         csums[i]);
-                               err = -EIO;
-                       }
+               ret = __readpage_endio_check(done->inode, io_bio, i,
+                                            bvec->bv_page, 0,
+                                            done->start, bvec->bv_len);
+               if (!ret)
+                       clean_io_failure(done->inode, done->start,
+                                        bvec->bv_page, 0);
+               else
+                       uptodate = 0;
+       }
+
+       done->uptodate = uptodate;
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+                                   struct btrfs_io_bio *io_bio, int err)
+{
+       struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
+       u64 start;
+       u64 offset = 0;
+       int i;
+       int ret;
+
+       err = 0;
+       start = io_bio->logical;
+       done.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+               ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+                                            0, start, bvec->bv_len);
+               if (likely(!ret))
+                       goto next;
+try_again:
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio, &done);
+               if (ret) {
+                       err = ret;
+                       goto next;
                }
 
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+next:
+               offset += bvec->bv_len;
                start += bvec->bv_len;
        }
 
+       return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio, int err)
+{
+       bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+       if (skip_csum) {
+               if (unlikely(err))
+                       return __btrfs_correct_data_nocsum(inode, io_bio);
+               else
+                       return 0;
+       } else {
+               return __btrfs_subio_endio_read(inode, io_bio, err);
+       }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct bio *dio_bio;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+       if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+               err = btrfs_subio_endio_read(inode, io_bio, err);
+
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
        dio_bio = dip->dio_bio;
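
btrfs_check_dio_repairable() decides whether a failed direct-IO read is worth retrying from another copy: with a single copy there is nothing to try, otherwise the next mirror is chosen, skipping the one that just failed, and retries stop once every copy has been attempted. A minimal userspace sketch of that mirror-stepping logic (in the kernel, num_copies comes from btrfs_num_copies()):

#include <stdio.h>

struct failrec {
        int this_mirror;        /* last mirror tried (0 = none yet)  */
        int failed_mirror;      /* mirror that produced the bad read */
};

/*
 * Mirrors the stepping in btrfs_check_dio_repairable(): advance to the next
 * copy, skip the mirror that already failed, and give up once all copies
 * have been tried.  Returns 1 if a retry should be submitted.
 */
static int pick_next_mirror(struct failrec *rec, int num_copies,
                            int failed_mirror)
{
        if (num_copies == 1)
                return 0;       /* single copy: the error will very likely persist */

        rec->failed_mirror = failed_mirror;
        rec->this_mirror++;
        if (rec->this_mirror == failed_mirror)
                rec->this_mirror++;

        return rec->this_mirror <= num_copies;
}

int main(void)
{
        struct failrec rec = { 0, 0 };
        int num_copies = 2;     /* e.g. RAID1 */
        int again;

        /* mirror 1 failed: the retry goes to mirror 2, then we run out */
        again = pick_next_mirror(&rec, num_copies, 1);
        printf("retry? %d (next mirror %d)\n", again, rec.this_mirror); /* 1, mirror 2 */
        again = pick_next_mirror(&rec, num_copies, 1);
        printf("retry? %d (next mirror %d)\n", again, rec.this_mirror); /* 0 */
        return 0;
}
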
@@ -7135,6 +7564,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
        if (err)
                clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
        dio_end_io(dio_bio, err);
+
+       if (io_bio->end_io)
+               io_bio->end_io(io_bio, err);
        bio_put(bio);
 }
 
@@ -7158,7 +7590,8 @@ again:
        if (!ret)
                goto out_test;
 
-       btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+       btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+                       finish_ordered_fn, NULL, NULL);
        btrfs_queue_work(root->fs_info->endio_write_workers,
                         &ordered->work);
 out_test:
@@ -7199,12 +7632,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
 
+       if (err)
+               btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+                          btrfs_ino(dip->inode), bio->bi_rw,
+                          (unsigned long long)bio->bi_iter.bi_sector,
+                          bio->bi_iter.bi_size, err);
+
+       if (dip->subio_endio)
+               err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
        if (err) {
-               btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-                         "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
-                     btrfs_ino(dip->inode), bio->bi_rw,
-                     (unsigned long long)bio->bi_iter.bi_sector,
-                     bio->bi_iter.bi_size, err);
                dip->errors = 1;
 
                /*
@@ -7235,6 +7673,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
        return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
 }
 
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+                                                struct inode *inode,
+                                                struct btrfs_dio_private *dip,
+                                                struct bio *bio,
+                                                u64 file_offset)
+{
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+       int ret;
+
+       /*
+        * We load all the csum data we need when we submit
+        * the first bio to reduce the csum tree search and
+        * contention.
+        */
+       if (dip->logical_offset == file_offset) {
+               ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+                                               file_offset);
+               if (ret)
+                       return ret;
+       }
+
+       if (bio == dip->orig_bio)
+               return 0;
+
+       file_offset -= dip->logical_offset;
+       file_offset >>= inode->i_sb->s_blocksize_bits;
+       io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+
+       return 0;
+}
+
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                                         int rw, u64 file_offset, int skip_sum,
                                         int async_submit)
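
For split direct-IO reads, btrfs_lookup_and_bind_dio_csum() looks up the checksums for the whole original bio once (when the first sub-bio is submitted) and then just points each later sub-bio into that array at the offset derived from its file offset: the byte distance from the start of the DIO, divided by the block size, indexes the array of per-block checksums. A small sketch of that index math, assuming 4 KiB blocks and 4-byte CRC32C checksums:

#include <stdint.h>
#include <stdio.h>

#define BLOCKSIZE       4096u
#define CSUM_SIZE       sizeof(uint32_t)        /* CRC32C */

/*
 * Given the csum array for the whole DIO and a sub-bio's file offset, return
 * a pointer to that sub-bio's first csum, mirroring the pointer arithmetic
 * in btrfs_lookup_and_bind_dio_csum().
 */
static const uint8_t *bind_csum(const uint8_t *orig_csums, uint64_t dio_start,
                                uint64_t subbio_offset)
{
        uint64_t block_index = (subbio_offset - dio_start) / BLOCKSIZE;

        return orig_csums + block_index * CSUM_SIZE;
}

int main(void)
{
        uint32_t csums[8] = { 0 };      /* csums for a 32 KiB DIO */
        const uint8_t *p;

        p = bind_csum((const uint8_t *)csums, 0, 12288);        /* offset of block index 3 */
        printf("csum index %td\n", (const uint32_t *)p - csums); /* 3 */
        return 0;
}
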
@@ -7273,13 +7743,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
                if (ret)
                        goto err;
-       } else if (!skip_sum) {
-               ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
-                                               file_offset);
+       } else {
+               ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+                                                    file_offset);
                if (ret)
                        goto err;
        }
-
 map:
        ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
 err:
@@ -7300,19 +7769,18 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        u64 submit_len = 0;
        u64 map_length;
        int nr_pages = 0;
-       int ret = 0;
+       int ret;
        int async_submit = 0;
 
        map_length = orig_bio->bi_iter.bi_size;
        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                              &map_length, NULL, 0);
-       if (ret) {
-               bio_put(orig_bio);
+       if (ret)
                return -EIO;
-       }
 
        if (map_length >= orig_bio->bi_iter.bi_size) {
                bio = orig_bio;
+               dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
                goto submit;
        }
 
@@ -7326,8 +7794,10 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
        if (!bio)
                return -ENOMEM;
+
        bio->bi_private = dip;
        bio->bi_end_io = btrfs_end_dio_bio;
+       btrfs_io_bio(bio)->logical = file_offset;
        atomic_inc(&dip->pending_bios);
 
        while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
@@ -7362,6 +7832,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                                goto out_err;
                        bio->bi_private = dip;
                        bio->bi_end_io = btrfs_end_dio_bio;
+                       btrfs_io_bio(bio)->logical = file_offset;
 
                        map_length = orig_bio->bi_iter.bi_size;
                        ret = btrfs_map_block(root->fs_info, rw,
@@ -7405,11 +7876,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_private *dip;
        struct bio *io_bio;
+       struct btrfs_io_bio *btrfs_bio;
        int skip_sum;
-       int sum_len;
        int write = rw & REQ_WRITE;
        int ret = 0;
-       u16 csum_size;
 
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -7419,16 +7889,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
                goto free_ordered;
        }
 
-       if (!skip_sum && !write) {
-               csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-               sum_len = dio_bio->bi_iter.bi_size >>
-                       inode->i_sb->s_blocksize_bits;
-               sum_len *= csum_size;
-       } else {
-               sum_len = 0;
-       }
-
-       dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+       dip = kzalloc(sizeof(*dip), GFP_NOFS);
        if (!dip) {
                ret = -ENOMEM;
                goto free_io_bio;
@@ -7440,20 +7901,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
        dip->bytes = dio_bio->bi_iter.bi_size;
        dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
        io_bio->bi_private = dip;
-       dip->errors = 0;
        dip->orig_bio = io_bio;
        dip->dio_bio = dio_bio;
        atomic_set(&dip->pending_bios, 0);
+       btrfs_bio = btrfs_io_bio(io_bio);
+       btrfs_bio->logical = file_offset;
 
-       if (write)
+       if (write) {
                io_bio->bi_end_io = btrfs_endio_direct_write;
-       else
+       } else {
                io_bio->bi_end_io = btrfs_endio_direct_read;
+               dip->subio_endio = btrfs_subio_endio_read;
+       }
 
        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
        if (!ret)
                return;
 
+       if (btrfs_bio->end_io)
+               btrfs_bio->end_io(btrfs_bio, ret);
 free_io_bio:
        bio_put(io_bio);
 
@@ -7534,7 +8000,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
        count = iov_iter_count(iter);
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags))
-               filemap_fdatawrite_range(inode->i_mapping, offset, count);
+               filemap_fdatawrite_range(inode->i_mapping, offset,
+                                        offset + count - 1);
 
        if (rw & WRITE) {
                /*
@@ -8041,6 +8508,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 
        set_nlink(inode, 1);
        btrfs_i_size_write(inode, 0);
+       unlock_new_inode(inode);
 
        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
        if (err)
@@ -8069,6 +8537,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
        ei->delalloc_bytes = 0;
+       ei->defrag_bytes = 0;
        ei->disk_i_size = 0;
        ei->flags = 0;
        ei->csum_bytes = 0;
@@ -8127,6 +8596,7 @@ void btrfs_destroy_inode(struct inode *inode)
        WARN_ON(BTRFS_I(inode)->reserved_extents);
        WARN_ON(BTRFS_I(inode)->delalloc_bytes);
        WARN_ON(BTRFS_I(inode)->csum_bytes);
+       WARN_ON(BTRFS_I(inode)->defrag_bytes);
 
        /*
         * This can happen where we create an inode, but somebody else also
@@ -8495,7 +8965,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
        work->inode = inode;
        work->wait = wait;
        work->delay_iput = delay_iput;
-       btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+       WARN_ON_ONCE(!inode);
+       btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+                       btrfs_run_delalloc_work, NULL, NULL);
 
        return work;
 }
@@ -8699,12 +9171,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
 
-       err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-       if (err) {
-               drop_inode = 1;
-               goto out_unlock;
-       }
-
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
@@ -8713,34 +9179,32 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        */
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
+       inode->i_mapping->a_ops = &btrfs_aops;
+       inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+       err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+       if (err)
+               goto out_unlock_inode;
 
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
-               drop_inode = 1;
-       else {
-               inode->i_mapping->a_ops = &btrfs_aops;
-               inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-               BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-       }
-       if (drop_inode)
-               goto out_unlock;
+               goto out_unlock_inode;
 
        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
-               drop_inode = 1;
-               goto out_unlock;
+               goto out_unlock_inode;
        }
        key.objectid = btrfs_ino(inode);
        key.offset = 0;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+       key.type = BTRFS_EXTENT_DATA_KEY;
        datasize = btrfs_file_extent_calc_inline_size(name_len);
        err = btrfs_insert_empty_item(trans, root, path, &key,
                                      datasize);
        if (err) {
-               drop_inode = 1;
                btrfs_free_path(path);
-               goto out_unlock;
+               goto out_unlock_inode;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8764,12 +9228,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        inode_set_bytes(inode, name_len);
        btrfs_i_size_write(inode, name_len);
        err = btrfs_update_inode(trans, root, inode);
-       if (err)
+       if (err) {
                drop_inode = 1;
+               goto out_unlock_inode;
+       }
+
+       unlock_new_inode(inode);
+       d_instantiate(dentry, inode);
 
 out_unlock:
-       if (!err)
-               d_instantiate(dentry, inode);
        btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
@@ -8777,6 +9244,11 @@ out_unlock:
        }
        btrfs_btree_balance_dirty(root);
        return err;
+
+out_unlock_inode:
+       drop_inode = 1;
+       unlock_new_inode(inode);
+       goto out_unlock;
 }
 
 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -8960,14 +9432,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
                goto out;
        }
 
-       ret = btrfs_init_inode_security(trans, inode, dir, NULL);
-       if (ret)
-               goto out;
-
-       ret = btrfs_update_inode(trans, root, inode);
-       if (ret)
-               goto out;
-
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
 
@@ -8975,10 +9439,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 
+       ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+       if (ret)
+               goto out_inode;
+
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret)
+               goto out_inode;
        ret = btrfs_orphan_add(trans, inode);
        if (ret)
-               goto out;
+               goto out_inode;
 
+       /*
+        * We set number of links to 0 in btrfs_new_inode(), and here we set
+        * it to 1 because d_tmpfile() will issue a warning if the count is 0,
+        * through:
+        *
+        *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+        */
+       set_nlink(inode, 1);
+       unlock_new_inode(inode);
        d_tmpfile(dentry, inode);
        mark_inode_dirty(inode);
 
@@ -8988,8 +9468,12 @@ out:
                iput(inode);
        btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
-
        return ret;
+
+out_inode:
+       unlock_new_inode(inode);
+       goto out;
+
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {