btrfs: make btrfs_issue_discard return bytes discarded
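btrfs_issue_discard() gains a "u64 *discarded_bytes" out parameter so that
btrfs_discard_extent() can account the bytes that were actually trimmed
instead of assuming the full stripe length. For now the helper is
all-or-nothing: on success *discarded_bytes == len, on failure it stays 0.
The new contract in outline (editorial sketch, not code from the patch;
"total" is illustrative):

	u64 bytes;

	ret = btrfs_issue_discard(bdev, start, len, &bytes);
	if (!ret)
		total += bytes;	/* today bytes == len; partial discards can plug in later */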
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8b353ad..16655bb 100644
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                              u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
-                               u64 bytenr, u64 num_bytes, u64 parent,
+                               struct btrfs_delayed_ref_node *node, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
-                               struct btrfs_delayed_extent_op *extra_op,
-                               int no_quota);
+                               struct btrfs_delayed_extent_op *extra_op);
 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
@@ -1884,10 +1883,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static int btrfs_issue_discard(struct block_device *bdev,
-                               u64 start, u64 len)
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+                              u64 *discarded_bytes)
 {
-       return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+       int ret = 0;
+
+       *discarded_bytes = 0;
+       ret = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+       if (!ret)
+               *discarded_bytes = len;
+
+       return ret;
 }
 
 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1908,14 +1914,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+                       u64 bytes;
                        if (!stripe->dev->can_discard)
                                continue;
 
                        ret = btrfs_issue_discard(stripe->dev->bdev,
                                                  stripe->physical,
-                                                 stripe->length);
+                                                 stripe->length,
+                                                 &bytes);
                        if (!ret)
-                               discarded_bytes += stripe->length;
+                               discarded_bytes += bytes;
                        else if (ret != -EOPNOTSUPP)
                                break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
@@ -1967,10 +1975,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
-                                 u64 bytenr, u64 num_bytes,
+                                 struct btrfs_delayed_ref_node *node,
                                  u64 parent, u64 root_objectid,
                                  u64 owner, u64 offset, int refs_to_add,
-                                 int no_quota,
                                  struct btrfs_delayed_extent_op *extent_op)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,9 +1985,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        struct extent_buffer *leaf;
        struct btrfs_extent_item *item;
        struct btrfs_key key;
+       u64 bytenr = node->bytenr;
+       u64 num_bytes = node->num_bytes;
        u64 refs;
        int ret;
-       enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+       int no_quota = node->no_quota;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -1996,26 +2005,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                           bytenr, num_bytes, parent,
                                           root_objectid, owner, offset,
                                           refs_to_add, extent_op);
-       if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
-               goto out;
-       /*
-        * Ok we were able to insert an inline extent and it appears to be a new
-        * reference, deal with the qgroup accounting.
-        */
-       if (!ret && !no_quota) {
-               ASSERT(root->fs_info->quota_enabled);
-               leaf = path->nodes[0];
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               item = btrfs_item_ptr(leaf, path->slots[0],
-                                     struct btrfs_extent_item);
-               if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
-                       type = BTRFS_QGROUP_OPER_ADD_SHARED;
-               btrfs_release_path(path);
-
-               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                             bytenr, num_bytes, type, 0);
+       if ((ret < 0 && ret != -EAGAIN) || !ret)
                goto out;
-       }
 
        /*
         * Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2026,8 +2017,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, item);
-       if (refs)
-               type = BTRFS_QGROUP_OPER_ADD_SHARED;
        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, item);
@@ -2035,13 +2024,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
 
-       if (!no_quota) {
-               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                             bytenr, num_bytes, type, 0);
-               if (ret)
-                       goto out;
-       }
-
        path->reada = 1;
        path->leave_spinning = 1;
        /* now insert the actual backref */
@@ -2087,17 +2069,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
                                                 ref->objectid, ref->offset,
                                                 &ins, node->ref_mod);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
-               ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-                                            node->num_bytes, parent,
+               ret = __btrfs_inc_extent_ref(trans, root, node, parent,
                                             ref_root, ref->objectid,
                                             ref->offset, node->ref_mod,
-                                            node->no_quota, extent_op);
+                                            extent_op);
        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
-               ret = __btrfs_free_extent(trans, root, node->bytenr,
-                                         node->num_bytes, parent,
+               ret = __btrfs_free_extent(trans, root, node, parent,
                                          ref_root, ref->objectid,
                                          ref->offset, node->ref_mod,
-                                         extent_op, node->no_quota);
+                                         extent_op);
        } else {
                BUG();
        }
@@ -2255,15 +2235,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                ref->level, &ins,
                                                node->no_quota);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
-               ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-                                            node->num_bytes, parent, ref_root,
-                                            ref->level, 0, 1, node->no_quota,
+               ret = __btrfs_inc_extent_ref(trans, root, node,
+                                            parent, ref_root,
+                                            ref->level, 0, 1,
                                             extent_op);
        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
-               ret = __btrfs_free_extent(trans, root, node->bytenr,
-                                         node->num_bytes, parent, ref_root,
-                                         ref->level, 0, 1, extent_op,
-                                         node->no_quota);
+               ret = __btrfs_free_extent(trans, root, node,
+                                         parent, ref_root,
+                                         ref->level, 0, 1, extent_op);
        } else {
                BUG();
        }
@@ -2323,28 +2302,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
-       struct rb_node *node;
-       struct btrfs_delayed_ref_node *ref, *last = NULL;;
+       struct btrfs_delayed_ref_node *ref;
+
+       if (list_empty(&head->ref_list))
+               return NULL;
 
        /*
-        * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
-        * this prevents ref count from going down to zero when
-        * there still are pending delayed ref.
+        * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+        * This is to prevent a ref count from going down to zero, which deletes
+        * the extent item from the extent tree, when there still are references
+        * to add, which would fail because they would not find the extent item.
         */
-       node = rb_first(&head->ref_root);
-       while (node) {
-               ref = rb_entry(node, struct btrfs_delayed_ref_node,
-                               rb_node);
+       list_for_each_entry(ref, &head->ref_list, list) {
                if (ref->action == BTRFS_ADD_DELAYED_REF)
                        return ref;
-               else if (last == NULL)
-                       last = ref;
-               node = rb_next(node);
        }
-       return last;
+
+       return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+                         list);
 }
 
 /*
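A concrete sequence shows why the ADD refs must be returned first: suppose an
extent's on-disk refcount is 1 and its head has pending refs [DROP, ADD]
(e.g. relocation dropped an implicit backref that was immediately re-added).
Running the DROP first would take the count to 0 and delete the extent item,
so the subsequent ADD would fail to find it; picking the ADD first goes
1 -> 2 -> 1 and both refs apply cleanly.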
@@ -2396,16 +2374,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        }
                }
 
-               /*
-                * We need to try and merge add/drops of the same ref since we
-                * can run into issues with relocate dropping the implicit ref
-                * and then it being added back again before the drop can
-                * finish.  If we merged anything we need to re-loop so we can
-                * get a good ref.
-                */
                spin_lock(&locked_ref->lock);
-               btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
-                                        locked_ref);
 
                /*
                 * locked_ref is the head node, so we have to go one
@@ -2482,7 +2451,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        spin_unlock(&locked_ref->lock);
                        spin_lock(&delayed_refs->lock);
                        spin_lock(&locked_ref->lock);
-                       if (rb_first(&locked_ref->ref_root) ||
+                       if (!list_empty(&locked_ref->ref_list) ||
                            locked_ref->extent_op) {
                                spin_unlock(&locked_ref->lock);
                                spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2465,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                } else {
                        actual_count++;
                        ref->in_tree = 0;
-                       rb_erase(&ref->rb_node, &locked_ref->ref_root);
+                       list_del(&ref->list);
                }
                atomic_dec(&delayed_refs->num_entries);
 
@@ -2538,6 +2507,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 * list before we release it.
                 */
                if (btrfs_delayed_ref_is_head(ref)) {
+                       if (locked_ref->is_data &&
+                           locked_ref->total_ref_mod < 0) {
+                               spin_lock(&delayed_refs->lock);
+                               delayed_refs->pending_csums -= ref->num_bytes;
+                               spin_unlock(&delayed_refs->lock);
+                       }
                        btrfs_delayed_ref_unlock(locked_ref);
                        locked_ref = NULL;
                }
@@ -2561,8 +2536,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 */
                spin_lock(&delayed_refs->lock);
                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
-               avg = div64_u64(avg, 4);
-               fs_info->avg_delayed_ref_runtime = avg;
+               fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
                spin_unlock(&delayed_refs->lock);
        }
        return 0;
@@ -2624,7 +2598,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
         * We don't ever fill up leaves all the way so multiply by 2 just to be
         * closer to what we're really going to want to use.
         */
-       return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+       return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Takes the number of bytes to be csumm'ed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+       u64 csum_size;
+       u64 num_csums_per_leaf;
+       u64 num_csums;
+
+       csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+       num_csums_per_leaf = div64_u64(csum_size,
+                       (u64)btrfs_super_csum_size(root->fs_info->super_copy));
+       num_csums = div64_u64(csum_bytes, root->sectorsize);
+       num_csums += num_csums_per_leaf - 1;
+       num_csums = div64_u64(num_csums, num_csums_per_leaf);
+       return num_csums;
 }
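Rough numbers for common defaults (assumed here: 16 KiB nodesize, 4 KiB
sectorsize, 4-byte crc32c checksums): csum_size is a little under 16 KiB, so
num_csums_per_leaf is on the order of 4000, and one leaf's worth of csum
items covers about 4000 * 4 KiB = ~16 MiB of data. Checksumming 1 GiB
(262144 csums) then needs ceil(262144 / 4000) = 66 leaves.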
 
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2632,7 +2625,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 {
        struct btrfs_block_rsv *global_rsv;
        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
-       u64 num_bytes;
+       u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
+       u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+       u64 num_bytes, num_dirty_bgs_bytes;
        int ret = 0;
 
        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2640,17 +2635,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
        if (num_heads > 1)
                num_bytes += (num_heads - 1) * root->nodesize;
        num_bytes <<= 1;
+       num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+       num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+                                                            num_dirty_bgs);
        global_rsv = &root->fs_info->global_block_rsv;
 
        /*
         * If we can't allocate any more chunks lets make sure we have _lots_ of
         * wiggle room since running delayed refs can create more delayed refs.
         */
-       if (global_rsv->space_info->full)
+       if (global_rsv->space_info->full) {
+               num_dirty_bgs_bytes <<= 1;
                num_bytes <<= 1;
+       }
 
        spin_lock(&global_rsv->lock);
-       if (global_rsv->reserved <= num_bytes)
+       if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
                ret = 1;
        spin_unlock(&global_rsv->lock);
        return ret;
@@ -2833,9 +2833,6 @@ again:
                goto again;
        }
 out:
-       ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
-       if (ret)
-               return ret;
        assert_qgroups_uptodate(trans);
        return 0;
 }
@@ -2874,7 +2871,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_data_ref *data_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
-       struct rb_node *node;
        int ret = 0;
 
        delayed_refs = &trans->transaction->delayed_refs;
@@ -2903,11 +2899,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
        spin_unlock(&delayed_refs->lock);
 
        spin_lock(&head->lock);
-       node = rb_first(&head->ref_root);
-       while (node) {
-               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-               node = rb_next(node);
-
+       list_for_each_entry(ref, &head->ref_list, list) {
                /* If it's a shared ref we know a cross reference exists */
                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
                        ret = 1;
@@ -3147,10 +3139,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
        btrfs_mark_buffer_dirty(leaf);
-       btrfs_release_path(path);
 fail:
-       if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+       btrfs_release_path(path);
        return ret;
 
 }
@@ -3193,7 +3183,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        struct inode *inode = NULL;
        u64 alloc_hint = 0;
        int dcs = BTRFS_DC_ERROR;
-       int num_pages = 0;
+       u64 num_pages = 0;
        int retries = 0;
        int ret = 0;
 
@@ -3267,15 +3257,14 @@ again:
                if (ret)
                        goto out_put;
 
-               ret = btrfs_truncate_free_space_cache(root, trans, inode);
+               ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
                if (ret)
                        goto out_put;
        }
 
        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED ||
-           !btrfs_test_opt(root, SPACE_CACHE) ||
-           block_group->delalloc_bytes) {
+           !btrfs_test_opt(root, SPACE_CACHE)) {
                /*
                 * don't bother trying to write stuff out _if_
                 * a) we're not cached,
@@ -3293,14 +3282,14 @@ again:
         * taking up quite a bit since it's not folded into the other space
         * cache.
         */
-       num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
+       num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
        if (!num_pages)
                num_pages = 1;
 
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
 
-       ret = btrfs_check_data_free_space(inode, num_pages);
+       ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
        if (ret)
                goto out_put;
 
@@ -3351,16 +3340,188 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS.  This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO.  There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *cache;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       int ret = 0;
+       int should_put;
+       struct btrfs_path *path = NULL;
+       LIST_HEAD(dirty);
+       struct list_head *io = &cur_trans->io_bgs;
+       int num_started = 0;
+       int loops = 0;
+
+       spin_lock(&cur_trans->dirty_bgs_lock);
+       if (list_empty(&cur_trans->dirty_bgs)) {
+               spin_unlock(&cur_trans->dirty_bgs_lock);
+               return 0;
+       }
+       list_splice_init(&cur_trans->dirty_bgs, &dirty);
+       spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+       /*
+        * make sure all the block groups on our dirty list actually
+        * exist
+        */
+       btrfs_create_pending_block_groups(trans, root);
+
+       if (!path) {
+               path = btrfs_alloc_path();
+               if (!path)
+                       return -ENOMEM;
+       }
+
+       /*
+        * cache_write_mutex is here only to save us from balance or automatic
+        * removal of empty block groups deleting this block group while we are
+        * writing out the cache
+        */
+       mutex_lock(&trans->transaction->cache_write_mutex);
+       while (!list_empty(&dirty)) {
+               cache = list_first_entry(&dirty,
+                                        struct btrfs_block_group_cache,
+                                        dirty_list);
+               /*
+                * this can happen if something re-dirties a block
+                * group that is already under IO.  Just wait for it to
+                * finish and then do it all again
+                */
+               if (!list_empty(&cache->io_list)) {
+                       list_del_init(&cache->io_list);
+                       btrfs_wait_cache_io(root, trans, cache,
+                                           &cache->io_ctl, path,
+                                           cache->key.objectid);
+                       btrfs_put_block_group(cache);
+               }
+
+               /*
+                * btrfs_wait_cache_io uses the cache->dirty_list to decide
+                * if it should update the cache_state.  Don't delete
+                * until after we wait.
+                *
+                * Since we're not running in the commit critical section
+                * we need the dirty_bgs_lock to protect from update_block_group
+                */
+               spin_lock(&cur_trans->dirty_bgs_lock);
+               list_del_init(&cache->dirty_list);
+               spin_unlock(&cur_trans->dirty_bgs_lock);
+
+               should_put = 1;
+
+               cache_save_setup(cache, trans, path);
+
+               if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+                       cache->io_ctl.inode = NULL;
+                       ret = btrfs_write_out_cache(root, trans, cache, path);
+                       if (ret == 0 && cache->io_ctl.inode) {
+                               num_started++;
+                               should_put = 0;
+
+                               /*
+                                * the cache_write_mutex is protecting
+                                * the io_list
+                                */
+                               list_add_tail(&cache->io_list, io);
+                       } else {
+                               /*
+                                * if we failed to write the cache, the
+                                * generation will be bad and life goes on
+                                */
+                               ret = 0;
+                       }
+               }
+               if (!ret) {
+                       ret = write_one_cache_group(trans, root, path, cache);
+                       /*
+                        * Our block group might still be attached to the list
+                        * of new block groups in the transaction handle of some
+                        * other task (struct btrfs_trans_handle->new_bgs). This
+                        * means its block group item isn't yet in the extent
+                        * tree. If this happens ignore the error, as we will
+                        * try again later in the critical section of the
+                        * transaction commit.
+                        */
+                       if (ret == -ENOENT) {
+                               ret = 0;
+                               spin_lock(&cur_trans->dirty_bgs_lock);
+                               if (list_empty(&cache->dirty_list)) {
+                                       list_add_tail(&cache->dirty_list,
+                                                     &cur_trans->dirty_bgs);
+                                       btrfs_get_block_group(cache);
+                               }
+                               spin_unlock(&cur_trans->dirty_bgs_lock);
+                       } else if (ret) {
+                               btrfs_abort_transaction(trans, root, ret);
+                       }
+               }
+
+               /* if it's not on the io list, we need to put the block group */
+               if (should_put)
+                       btrfs_put_block_group(cache);
+
+               if (ret)
+                       break;
+
+               /*
+                * Avoid blocking other tasks for too long. It might even save
+                * us from writing caches for block groups that are going to be
+                * removed.
+                */
+               mutex_unlock(&trans->transaction->cache_write_mutex);
+               mutex_lock(&trans->transaction->cache_write_mutex);
+       }
+       mutex_unlock(&trans->transaction->cache_write_mutex);
+
+       /*
+        * go through delayed refs for all the stuff we've just kicked off
+        * and then loop back (just once)
+        */
+       ret = btrfs_run_delayed_refs(trans, root, 0);
+       if (!ret && loops == 0) {
+               loops++;
+               spin_lock(&cur_trans->dirty_bgs_lock);
+               list_splice_init(&cur_trans->dirty_bgs, &dirty);
+               /*
+                * dirty_bgs_lock protects us from concurrent block group
+                * deletes too (not just cache_write_mutex).
+                */
+               if (!list_empty(&dirty)) {
+                       spin_unlock(&cur_trans->dirty_bgs_lock);
+                       goto again;
+               }
+               spin_unlock(&cur_trans->dirty_bgs_lock);
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
+
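The resulting two-phase flow in outline (editorial sketch of the call order,
not code from the patch):

	/* while other tasks may still join the transaction: */
	btrfs_start_dirty_block_groups(trans, root);	/* best effort */

	/* later, inside the commit critical section: */
	btrfs_write_dirty_block_groups(trans, root);	/* final pass, below */

The first pass trims the dirty list early, so the critical-section pass only
has to redo block groups that were re-dirtied in the meantime.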
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
+       int should_put;
        struct btrfs_path *path;
-
-       if (list_empty(&cur_trans->dirty_bgs))
-               return 0;
+       struct list_head *io = &cur_trans->io_bgs;
+       int num_started = 0;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -3376,16 +3537,64 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
                                         dirty_list);
+
+               /*
+                * this can happen if cache_save_setup re-dirties a block
+                * group that is already under IO.  Just wait for it to
+                * finish and then do it all again
+                */
+               if (!list_empty(&cache->io_list)) {
+                       list_del_init(&cache->io_list);
+                       btrfs_wait_cache_io(root, trans, cache,
+                                           &cache->io_ctl, path,
+                                           cache->key.objectid);
+                       btrfs_put_block_group(cache);
+               }
+
+               /*
+                * don't remove from the dirty list until after we've waited
+                * on any pending IO
+                */
                list_del_init(&cache->dirty_list);
-               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-                       cache_save_setup(cache, trans, path);
-               if (!ret)
-                       ret = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long) -1);
-               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
-                       btrfs_write_out_cache(root, trans, cache, path);
+               should_put = 1;
+
+               cache_save_setup(cache, trans, path);
+
                if (!ret)
+                       ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+                       cache->io_ctl.inode = NULL;
+                       ret = btrfs_write_out_cache(root, trans, cache, path);
+                       if (ret == 0 && cache->io_ctl.inode) {
+                               num_started++;
+                               should_put = 0;
+                               list_add_tail(&cache->io_list, io);
+                       } else {
+                               /*
+                                * if we failed to write the cache, the
+                                * generation will be bad and life goes on
+                                */
+                               ret = 0;
+                       }
+               }
+               if (!ret) {
                        ret = write_one_cache_group(trans, root, path, cache);
+                       if (ret)
+                               btrfs_abort_transaction(trans, root, ret);
+               }
+
+               /* if it's not on the io list, we need to put the block group */
+               if (should_put)
+                       btrfs_put_block_group(cache);
+       }
+
+       while (!list_empty(io)) {
+               cache = list_first_entry(io, struct btrfs_block_group_cache,
+                                        io_list);
+               list_del_init(&cache->io_list);
+               btrfs_wait_cache_io(root, trans, cache,
+                                   &cache->io_ctl, path, cache->key.objectid);
                btrfs_put_block_group(cache);
        }
 
@@ -3445,7 +3654,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                found->disk_total += total_bytes * factor;
                found->bytes_used += bytes_used;
                found->disk_used += bytes_used * factor;
-               found->full = 0;
+               if (total_bytes > 0)
+                       found->full = 0;
                spin_unlock(&found->lock);
                *space_info = found;
                return 0;
@@ -3473,7 +3683,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_reserved = 0;
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
-       found->full = 0;
+       if (total_bytes > 0)
+               found->full = 0;
+       else
+               found->full = 1;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@ -3635,19 +3848,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
  */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
-       int ret = 0, committed = 0, alloc_chunk = 1;
+       int ret = 0;
+       int need_commit = 2;
+       int have_pinned_space;
 
        /* make sure bytes are sectorsize aligned */
        bytes = ALIGN(bytes, root->sectorsize);
 
        if (btrfs_is_free_space_inode(inode)) {
-               committed = 1;
+               need_commit = 0;
                ASSERT(current->journal_info);
        }
 
@@ -3669,7 +3884,7 @@ again:
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
                 */
-               if (!data_sinfo->full && alloc_chunk) {
+               if (!data_sinfo->full) {
                        u64 alloc_target;
 
                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -3697,8 +3912,10 @@ alloc:
                        if (ret < 0) {
                                if (ret != -ENOSPC)
                                        return ret;
-                               else
+                               else {
+                                       have_pinned_space = 1;
                                        goto commit_trans;
+                               }
                        }
 
                        if (!data_sinfo)
@@ -3709,26 +3926,42 @@ alloc:
 
                /*
                 * If we don't have enough pinned space to deal with this
-                * allocation don't bother committing the transaction.
+                * allocation, and no removed chunk in current transaction,
+                * don't bother committing the transaction.
                 */
-               if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
-                                          bytes) < 0)
-                       committed = 1;
+               have_pinned_space = percpu_counter_compare(
+                       &data_sinfo->total_bytes_pinned,
+                       used + bytes - data_sinfo->total_bytes);
                spin_unlock(&data_sinfo->lock);
 
                /* commit the current transaction and try again */
 commit_trans:
-               if (!committed &&
+               if (need_commit &&
                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
-                       committed = 1;
+                       need_commit--;
+
+                       if (need_commit > 0)
+                               btrfs_wait_ordered_roots(fs_info, -1);
 
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
-                       ret = btrfs_commit_transaction(trans, root);
-                       if (ret)
-                               return ret;
-                       goto again;
+                       if (have_pinned_space >= 0 ||
+                           trans->transaction->have_free_bgs ||
+                           need_commit > 0) {
+                               ret = btrfs_commit_transaction(trans, root);
+                               if (ret)
+                                       return ret;
+                               /*
+                                * make sure that all running delayed iput are
+                                * done
+                                */
+                               down_write(&root->fs_info->delayed_iput_sem);
+                               up_write(&root->fs_info->delayed_iput_sem);
+                               goto again;
+                       } else {
+                               btrfs_end_transaction(trans, root);
+                       }
                }
 
                trace_btrfs_space_reservation(root->fs_info,
@@ -3736,12 +3969,16 @@ commit_trans:
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
+       ret = btrfs_qgroup_reserve(root, write_bytes);
+       if (ret)
+               goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
+out:
        spin_unlock(&data_sinfo->lock);
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -3819,7 +4056,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
        return 1;
 }
 
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
 {
        u64 num_dev;
 
@@ -3833,24 +4070,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
        else
                num_dev = 1;    /* DUP or single */
 
-       /* metadata for updaing devices and chunk tree */
-       return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+       return num_dev;
 }
 
-static void check_system_chunk(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       u64 type)
 {
        struct btrfs_space_info *info;
        u64 left;
        u64 thresh;
+       int ret = 0;
+       u64 num_devs;
+
+       /*
+        * Needed because we can end up allocating a system chunk and for an
+        * atomic and race free space reservation in the chunk block reserve.
+        */
+       ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
 
        info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        spin_lock(&info->lock);
        left = info->total_bytes - info->bytes_used - info->bytes_pinned -
-               info->bytes_reserved - info->bytes_readonly;
+               info->bytes_reserved - info->bytes_readonly -
+               info->bytes_may_use;
        spin_unlock(&info->lock);
 
-       thresh = get_system_chunk_thresh(root, type);
+       num_devs = get_profile_num_devs(root, type);
+
+       /* num_devs device items to update and 1 chunk item to add or remove */
+       thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+               btrfs_calc_trans_metadata_size(root, 1);
+
        if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
                btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
                        left, thresh, type);
@@ -3861,7 +4117,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
                u64 flags;
 
                flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
-               btrfs_alloc_chunk(trans, root, flags);
+               /*
+                * Ignore failure to create system chunk. We might end up not
+                * needing it, as we might not need to COW all nodes/leafs from
+                * the paths we visit in the chunk tree (they were already COWed
+                * or created in the current transaction for example).
+                */
+               ret = btrfs_alloc_chunk(trans, root, flags);
+       }
+
+       if (!ret) {
+               ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+                                         &root->fs_info->chunk_block_rsv,
+                                         thresh, BTRFS_RESERVE_NO_FLUSH);
+               if (!ret)
+                       trans->chunk_bytes_reserved += thresh;
        }
 }
 
@@ -3966,6 +4236,24 @@ out:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
        mutex_unlock(&fs_info->chunk_mutex);
+       /*
+        * When we allocate a new chunk we reserve space in the chunk block
+        * reserve to make sure we can COW nodes/leafs in the chunk tree or
+        * add new nodes/leafs to it if we end up needing to do it when
+        * inserting the chunk item and updating device items as part of the
+        * second phase of chunk allocation, performed by
+        * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+        * large number of new block groups to create in our transaction
+        * handle's new_bgs list to avoid exhausting the chunk block reserve
+        * in extreme cases - like having a single transaction create many new
+        * block groups when starting to write out the free space caches of all
+        * the block groups that were made dirty during the lifetime of the
+        * transaction.
+        */
+       if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+               btrfs_create_pending_block_groups(trans, trans->root);
+               btrfs_trans_release_chunk_metadata(trans);
+       }
        return ret;
 }
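For scale (assuming this era's btrfs_calc_trans_metadata_size() reserves
nodesize * 2 * BTRFS_MAX_LEVEL bytes per item, i.e. 256 KiB with 16 KiB
nodes): each chunk allocation reserves a few hundred KiB in the chunk block
reserve, so the 2 MiB cap above forces pending block group creation after a
handful of back-to-back allocations rather than letting the reserve drain.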
 
@@ -4298,8 +4586,13 @@ out:
 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
                                        struct btrfs_fs_info *fs_info, u64 used)
 {
-       return (used >= div_factor_fine(space_info->total_bytes, 98) &&
-               !btrfs_fs_closing(fs_info) &&
+       u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+       /* If we're just plain full then async reclaim just slows us down. */
+       if (space_info->bytes_used >= thresh)
+               return 0;
+
+       return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
@@ -4354,10 +4647,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                if (!btrfs_need_do_async_reclaim(space_info, fs_info,
                                                 flush_state))
                        return;
-       } while (flush_state <= COMMIT_TRANS);
-
-       if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
-               queue_work(system_unbound_wq, work);
+       } while (flush_state < COMMIT_TRANS);
 }
 
 void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -4700,6 +4990,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
        kfree(rsv);
 }
 
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+       kfree(rsv);
+}
+
 int btrfs_block_rsv_add(struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                        enum btrfs_reserve_flush_enum flush)
@@ -4812,10 +5107,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
                    csum_size * 2;
-       num_bytes += div64_u64(data_used + meta_used, 50);
+       num_bytes += div_u64(data_used + meta_used, 50);
 
        if (num_bytes * 3 > meta_used)
-               num_bytes = div64_u64(meta_used, 3);
+               num_bytes = div_u64(meta_used, 3);
 
        return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
@@ -4912,6 +5207,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
        trans->bytes_reserved = 0;
 }
 
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+       struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+       if (!trans->chunk_bytes_reserved)
+               return;
+
+       WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+       block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+                               trans->chunk_bytes_reserved);
+       trans->chunk_bytes_reserved = 0;
+}
+
 /* Can only return 0 or -ENOSPC */
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
                                  struct inode *inode)
@@ -4998,8 +5311,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
                                      u64 qgroup_reserved)
 {
        btrfs_block_rsv_release(root, rsv, (u64)-1);
-       if (qgroup_reserved)
-               btrfs_qgroup_free(root, qgroup_reserved);
 }
 
 /**
@@ -5066,30 +5377,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
                                   int reserve)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       u64 csum_size;
-       int num_csums_per_leaf;
-       int num_csums;
-       int old_csums;
+       u64 old_csums, num_csums;
 
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
            BTRFS_I(inode)->csum_bytes == 0)
                return 0;
 
-       old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+       old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
        if (reserve)
                BTRFS_I(inode)->csum_bytes += num_bytes;
        else
                BTRFS_I(inode)->csum_bytes -= num_bytes;
-       csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
-       num_csums_per_leaf = (int)div64_u64(csum_size,
-                                           sizeof(struct btrfs_csum_item) +
-                                           sizeof(struct btrfs_disk_key));
-       num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
-       num_csums = num_csums + num_csums_per_leaf - 1;
-       num_csums = num_csums / num_csums_per_leaf;
-
-       old_csums = old_csums + num_csums_per_leaf - 1;
-       old_csums = old_csums / num_csums_per_leaf;
+       num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
 
        /* No change, no need to reserve more */
        if (old_csums == num_csums)
@@ -5163,8 +5462,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        spin_unlock(&BTRFS_I(inode)->lock);
 
        if (root->fs_info->quota_enabled) {
-               ret = btrfs_qgroup_reserve(root, num_bytes +
-                                          nr_extents * root->nodesize);
+               ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
@@ -5172,8 +5470,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
                if (root->fs_info->quota_enabled)
-                       btrfs_qgroup_free(root, num_bytes +
-                                               nr_extents * root->nodesize);
+                       btrfs_qgroup_free(root, nr_extents * root->nodesize);
                goto out_fail;
        }
 
@@ -5290,10 +5587,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
-       if (root->fs_info->quota_enabled) {
-               btrfs_qgroup_free(root, num_bytes +
-                                       dropped * root->nodesize);
-       }
 
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
@@ -5318,7 +5611,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 {
        int ret;
 
-       ret = btrfs_check_data_free_space(inode, num_bytes);
+       ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
        if (ret)
                return ret;
 
@@ -5390,14 +5683,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
                        cache_block_group(cache, 1);
 
-               spin_lock(&trans->transaction->dirty_bgs_lock);
-               if (list_empty(&cache->dirty_list)) {
-                       list_add_tail(&cache->dirty_list,
-                                     &trans->transaction->dirty_bgs);
-                       btrfs_get_block_group(cache);
-               }
-               spin_unlock(&trans->transaction->dirty_bgs_lock);
-
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5446,6 +5731,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                                spin_unlock(&info->unused_bgs_lock);
                        }
                }
+
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+               if (list_empty(&cache->dirty_list)) {
+                       list_add_tail(&cache->dirty_list,
+                                     &trans->transaction->dirty_bgs);
+                       trans->transaction->num_dirty_bgs++;
+                       btrfs_get_block_group(cache);
+               }
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@ -5834,11 +6129,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
 
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
-                               u64 bytenr, u64 num_bytes, u64 parent,
+                               struct btrfs_delayed_ref_node *node, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
-                               struct btrfs_delayed_extent_op *extent_op,
-                               int no_quota)
+                               struct btrfs_delayed_extent_op *extent_op)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
@@ -5852,10 +6146,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        int extent_slot = 0;
        int found_extent = 0;
        int num_to_del = 1;
+       int no_quota = node->no_quota;
        u32 item_size;
        u64 refs;
+       u64 bytenr = node->bytenr;
+       u64 num_bytes = node->num_bytes;
        int last_ref = 0;
-       enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
 
@@ -6036,7 +6332,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        refs -= refs_to_drop;
 
        if (refs > 0) {
-               type = BTRFS_QGROUP_OPER_SUB_SHARED;
                if (extent_op)
                        __run_delayed_extent_op(extent_op, leaf, ei);
                /*
@@ -6098,18 +6393,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        }
        btrfs_release_path(path);
 
-       /* Deal with the quota accounting */
-       if (!ret && last_ref && !no_quota) {
-               int mod_seq = 0;
-
-               if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-                   type == BTRFS_QGROUP_OPER_SUB_SHARED)
-                       mod_seq = 1;
-
-               ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
-                                             bytenr, num_bytes, type,
-                                             mod_seq);
-       }
 out:
        btrfs_free_path(path);
        return ret;
@@ -6135,7 +6418,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
                goto out_delayed_unlock;
 
        spin_lock(&head->lock);
-       if (rb_first(&head->ref_root))
+       if (!list_empty(&head->ref_list))
                goto out;
 
        if (head->extent_op) {
@@ -6956,15 +7239,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
                return -ENOSPC;
        }
 
-       if (btrfs_test_opt(root, DISCARD))
-               ret = btrfs_discard_extent(root, start, len, NULL);
-
        if (pin)
                pin_down_extent(root, cache, start, len, 1);
        else {
+               if (btrfs_test_opt(root, DISCARD))
+                       ret = btrfs_discard_extent(root, start, len, NULL);
                btrfs_add_free_space(cache, start, len);
                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
        }
+
        btrfs_put_block_group(cache);
 
        trace_btrfs_reserved_extent_free(root, start, len);
@@ -7045,13 +7328,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
-       /* Always set parent to 0 here since its exclusive anyway. */
-       ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                     ins->objectid, ins->offset,
-                                     BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-       if (ret)
-               return ret;
-
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7095,9 +7371,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
                                      ins, size);
        if (ret) {
+               btrfs_free_path(path);
                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
                                                   root->nodesize);
-               btrfs_free_path(path);
                return ret;
        }
 
@@ -7133,14 +7409,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
-       if (!no_quota) {
-               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                             ins->objectid, num_bytes,
-                                             BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-               if (ret)
-                       return ret;
-       }
-
        ret = update_block_group(trans, root, ins->objectid, root->nodesize,
                                 1);
        if (ret) { /* -ENOENT, logic error */
@@ -7217,7 +7485,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
-       clean_tree_block(trans, root, buf);
+       clean_tree_block(trans, root->fs_info, buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
@@ -7311,7 +7579,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
  * returns the key for the extent through ins, and a tree buffer for
  * the first block of the extent through buf.
  *
- * returns the tree buffer or NULL.
+ * returns the tree buffer or an ERR_PTR on error.
  */
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
@@ -7322,6 +7590,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
        struct btrfs_key ins;
        struct btrfs_block_rsv *block_rsv;
        struct extent_buffer *buf;
+       struct btrfs_delayed_extent_op *extent_op;
        u64 flags = 0;
        int ret;
        u32 blocksize = root->nodesize;
@@ -7342,13 +7611,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 
        ret = btrfs_reserve_extent(root, blocksize, blocksize,
                                   empty_size, hint, &ins, 0, 0);
-       if (ret) {
-               unuse_block_rsv(root->fs_info, block_rsv, blocksize);
-               return ERR_PTR(ret);
-       }
+       if (ret)
+               goto out_unuse;
 
        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
-       BUG_ON(IS_ERR(buf)); /* -ENOMEM */
+       if (IS_ERR(buf)) {
+               ret = PTR_ERR(buf);
+               goto out_free_reserved;
+       }
 
        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent == 0)
@@ -7358,9 +7628,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                BUG_ON(parent > 0);
 
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-               struct btrfs_delayed_extent_op *extent_op;
                extent_op = btrfs_alloc_delayed_extent_op();
-               BUG_ON(!extent_op); /* -ENOMEM */
+               if (!extent_op) {
+                       ret = -ENOMEM;
+                       goto out_free_buf;
+               }
                if (key)
                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
                else
@@ -7375,13 +7647,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                extent_op->level = level;
 
                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
-                                       ins.objectid,
-                                       ins.offset, parent, root_objectid,
-                                       level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op, 0);
-               BUG_ON(ret); /* -ENOMEM */
+                                                ins.objectid, ins.offset,
+                                                parent, root_objectid, level,
+                                                BTRFS_ADD_DELAYED_EXTENT,
+                                                extent_op, 0);
+               if (ret)
+                       goto out_free_delayed;
        }
        return buf;
+
+out_free_delayed:
+       btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+       free_extent_buffer(buf);
+out_free_reserved:
+       btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+out_unuse:
+       unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+       return ERR_PTR(ret);
 }
 
 struct walk_control {
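
The hunk above replaces the BUG_ON()s with the kernel's goto-unwind idiom: each acquired resource gets a label, and a failure jumps to the label that releases everything obtained so far, in reverse order of acquisition (delayed op, then buffer, then reserved extent, then block rsv). A minimal, self-contained userspace sketch of the idiom — the three malloc()ed "resources" are hypothetical stand-ins, not the btrfs API:

#include <stdlib.h>

/* Hypothetical three-step setup mirroring the unwind ladder above:
 * reserve -> init buffer -> alloc op, unwinding in reverse on failure. */
static int setup_three(char **a, char **b, char **c)
{
	int ret = -1;

	*a = malloc(64);		/* like the reserved extent */
	if (!*a)
		goto out;
	*b = malloc(64);		/* like the new tree buffer */
	if (!*b)
		goto out_free_a;
	*c = malloc(64);		/* like the delayed extent op */
	if (!*c)
		goto out_free_b;
	return 0;			/* success: caller owns a, b and c */

out_free_b:
	free(*b);
out_free_a:
	free(*a);
out:
	return ret;
}
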
@@ -7482,12 +7765,18 @@ reada:
        wc->reada_slot = slot;
 }
 
+/*
+ * TODO: Modify the related functions to add the affected node/leaf to
+ * dirty_extent_root for later qgroup accounting.
+ *
+ * Currently, this function does nothing.
+ */
 static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
 {
        int nr = btrfs_header_nritems(eb);
-       int i, extent_type, ret;
+       int i, extent_type;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        u64 bytenr, num_bytes;
@@ -7510,13 +7799,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
                        continue;
 
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
-               ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-                                             root->objectid,
-                                             bytenr, num_bytes,
-                                             BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
-               if (ret)
-                       return ret;
        }
        return 0;
 }
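
As the TODO says, the leaf walk is kept but the per-extent qgroup call is gone; the intent is to instead queue each data extent for delayed accounting. A sketch of that intent, assuming btrfs's leaf accessors from ctree.h; record_dirty_extent() is a hypothetical placeholder for whatever helper will eventually feed dirty_extent_root:

/* Sketch only: record_dirty_extent() is hypothetical. */
static int account_leaf_items_sketch(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct extent_buffer *eb)
{
	int nr = btrfs_header_nritems(eb);
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 bytenr, num_bytes;
	int i;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* inline extents live in the leaf itself; nothing to account */
		if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)		/* holes have a zero disk bytenr */
			continue;

		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
		/* hypothetical: queue (bytenr, num_bytes) for qgroups */
		record_dirty_extent(trans, bytenr, num_bytes);
	}
	return 0;
}
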
@@ -7585,6 +7867,8 @@ static int adjust_slots_upwards(struct btrfs_root *root,
 
 /*
  * root_eb is the subtree root and is locked before this function is called.
+ * TODO: Modify this function to mark all subtree blocks (including completely
+ * shared nodes) dirty in dirty_extent_root so they can be accounted in qgroup.
  */
 static int account_shared_subtree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
@@ -7647,7 +7931,11 @@ walk_down:
                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
 
                        eb = read_tree_block(root, child_bytenr, child_gen);
-                       if (!eb || !extent_buffer_uptodate(eb)) {
+                       if (IS_ERR(eb)) {
+                               ret = PTR_ERR(eb);
+                               goto out;
+                       } else if (!extent_buffer_uptodate(eb)) {
+                               free_extent_buffer(eb);
                                ret = -EIO;
                                goto out;
                        }
@@ -7658,16 +7946,6 @@ walk_down:
                        btrfs_tree_read_lock(eb);
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
-                       ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-                                               root->objectid,
-                                               child_bytenr,
-                                               root->nodesize,
-                                               BTRFS_QGROUP_OPER_SUB_SUBTREE,
-                                               0);
-                       if (ret)
-                               goto out;
-
                }
 
                if (level == 0) {
@@ -7815,7 +8093,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
        blocksize = root->nodesize;
 
-       next = btrfs_find_tree_block(root, bytenr);
+       next = btrfs_find_tree_block(root->fs_info, bytenr);
        if (!next) {
                next = btrfs_find_create_tree_block(root, bytenr);
                if (!next)
@@ -7878,7 +8156,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
                next = read_tree_block(root, bytenr, generation);
-               if (!next || !extent_buffer_uptodate(next)) {
+               if (IS_ERR(next)) {
+                       return PTR_ERR(next);
+               } else if (!extent_buffer_uptodate(next)) {
                        free_extent_buffer(next);
                        return -EIO;
                }
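
Both this hunk and the account_shared_subtree one above adapt callers to read_tree_block() reporting failures via ERR_PTR() instead of NULL: the caller must distinguish "the read itself failed" (IS_ERR, no buffer to free) from "a buffer came back but is not uptodate" (the reference must be dropped). A self-contained userspace sketch of the ERR_PTR convention, with minimal stand-ins for the linux/err.h macros and a hypothetical fetch_block():

#include <stdio.h>
#include <errno.h>

/* Minimal stand-ins for the kernel's linux/err.h helpers. */
#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *fetch_block(int fail)	/* hypothetical helper */
{
	static char block[16];
	return fail ? ERR_PTR(-EIO) : block;
}

int main(void)
{
	void *b = fetch_block(1);

	if (IS_ERR(b))		/* errno encoded in the pointer value */
		printf("error: %ld\n", PTR_ERR(b));
	else
		printf("got block %p\n", b);
	return 0;
}
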
@@ -8016,7 +8296,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        btrfs_set_lock_blocking(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                }
-               clean_tree_block(trans, root, eb);
+               clean_tree_block(trans, root->fs_info, eb);
        }
 
        if (eb == root->node) {
@@ -8260,24 +8540,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                goto out_end_trans;
                        }
 
-                       /*
-                        * Qgroup update accounting is run from
-                        * delayed ref handling. This usually works
-                        * out because delayed refs are normally the
-                        * only way qgroup updates are added. However,
-                        * we may have added updates during our tree
-                        * walk so run qgroups here to make sure we
-                        * don't lose any updates.
-                        */
-                       ret = btrfs_delayed_qgroup_accounting(trans,
-                                                             root->fs_info);
-                       if (ret)
-                               printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-                                                  "running qgroup updates "
-                                                  "during snapshot delete. "
-                                                  "Quota is out of sync, "
-                                                  "rescan required.\n", ret);
-
                        btrfs_end_transaction_throttle(trans, tree_root);
                        if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
                                pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8331,14 +8593,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        }
        root_dropped = true;
 out_end_trans:
-       ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
-       if (ret)
-               printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-                                  "running qgroup updates "
-                                  "during snapshot delete. "
-                                  "Quota is out of sync, "
-                                  "rescan required.\n", ret);
-
        btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
        kfree(wc);
@@ -8533,10 +8787,48 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
        BUG_ON(cache->ro);
 
+again:
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
+       /*
+        * we're not allowed to set block groups readonly after the dirty
+        * block groups cache has started writing.  If it already started,
+        * back off and let this transaction commit
+        */
+       mutex_lock(&root->fs_info->ro_block_group_mutex);
+       if (trans->transaction->dirty_bg_run) {
+               u64 transid = trans->transid;
+
+               mutex_unlock(&root->fs_info->ro_block_group_mutex);
+               btrfs_end_transaction(trans, root);
+
+               ret = btrfs_wait_for_commit(root, transid);
+               if (ret)
+                       return ret;
+               goto again;
+       }
+
+       /*
+        * if we are changing raid levels, try to allocate a corresponding
+        * block group with the new raid level.
+        */
+       alloc_flags = update_block_group_flags(root, cache->flags);
+       if (alloc_flags != cache->flags) {
+               ret = do_chunk_alloc(trans, root, alloc_flags,
+                                    CHUNK_ALLOC_FORCE);
+               /*
+                * ENOSPC is allowed here, we may have enough space
+                * already allocated at the new raid level to
+                * carry on
+                */
+               if (ret == -ENOSPC)
+                       ret = 0;
+               if (ret < 0)
+                       goto out;
+       }
+
        ret = set_block_group_ro(cache, 0);
        if (!ret)
                goto out;
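
The again: label above implements a back-off loop: take the mutex, test whether the unsafe phase (dirty block group writeout) has already begun, and if so drop the lock and the transaction, wait for that transaction to commit, and retry from the top with a fresh one. A generic pthread sketch of the same shape — it assumes the phase-runner clears phase_running and broadcasts done when it finishes:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static bool phase_running;

static void wait_for_phase(void)
{
	pthread_mutex_lock(&lock);
	while (phase_running)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}

static void do_safely(void (*work)(void))
{
again:
	pthread_mutex_lock(&lock);
	if (phase_running) {
		pthread_mutex_unlock(&lock);
		wait_for_phase();	/* like btrfs_wait_for_commit() */
		goto again;		/* state changed; recheck from scratch */
	}
	work();		/* safe: the phase cannot begin while we hold lock */
	pthread_mutex_unlock(&lock);
}
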
@@ -8549,8 +8841,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 out:
        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
                alloc_flags = update_block_group_flags(root, cache->flags);
+               lock_chunks(root->fs_info->chunk_root);
                check_system_chunk(trans, root, alloc_flags);
+               unlock_chunks(root->fs_info->chunk_root);
        }
+       mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
        btrfs_end_transaction(trans, root);
        return ret;
@@ -8720,7 +9015,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                min_free <<= 1;
        } else if (index == BTRFS_RAID_RAID0) {
                dev_min = fs_devices->rw_devices;
-               do_div(min_free, dev_min);
+               min_free = div64_u64(min_free, dev_min);
        }
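
The do_div() to div64_u64() change is more than style: do_div() requires a 32-bit divisor, updates its first argument in place to the quotient, and evaluates to the remainder, whereas div64_u64() takes a full 64-bit divisor and simply returns the quotient. dev_min is a u64 here, so div64_u64() is the correct helper. Both are real kernel APIs; a fragment contrasting them:

#include <linux/math64.h>	/* div64_u64() */
#include <asm/div64.h>		/* do_div() */

static u64 quot_32bit_divisor(u64 n, u32 base)
{
	u32 rem = do_div(n, base);	/* n becomes n / base */

	(void)rem;			/* do_div() yields the remainder */
	return n;
}

static u64 quot_64bit_divisor(u64 n, u64 base)
{
	return div64_u64(n, base);	/* quotient; 64-bit divisor is fine */
}
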
 
        /* We need to do this so that we can look at pending chunks */
@@ -8992,6 +9287,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
        INIT_LIST_HEAD(&cache->dirty_list);
+       INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
 
@@ -9247,6 +9543,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        free_excluded_extents(root, cache);
 
+       /*
+        * Call to ensure the corresponding space_info object is created and
+        * assigned to our block group, but don't update its counters just yet.
+        * We want our bg to be added to the rbtree with its ->space_info set.
+        */
+       ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+                               &cache->space_info);
+       if (ret) {
+               btrfs_remove_free_space_cache(cache);
+               btrfs_put_block_group(cache);
+               return ret;
+       }
+
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
        if (ret) {
                btrfs_remove_free_space_cache(cache);
@@ -9254,6 +9563,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                return ret;
        }
 
+       /*
+        * Now that our block group has its ->space_info set and is inserted in
+        * the rbtree, update the space info's counters.
+        */
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        if (ret) {
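
The two update_space_info() calls above form a two-phase initialization: the first, with sizes of zero, only guarantees cache->space_info is assigned before the block group becomes visible in the rbtree; the second, after insertion, folds the real byte counts into that space_info. A sketch of the pattern with deliberately simplified types (not the btrfs structures; the real helper also looks up or creates the space_info and takes its lock):

struct space_info_s { unsigned long long total, used; };
struct block_group_s { struct space_info_s *space_info; };

/* phase 1: bind the object so readers that find 'bg' in the index
 * never see a NULL ->space_info; counters stay untouched. */
static void bind_space_info(struct block_group_s *bg, struct space_info_s *si)
{
	bg->space_info = si;
}

/* phase 2: only after 'bg' is published, account its bytes. */
static void account_space(struct block_group_s *bg,
			  unsigned long long size, unsigned long long used)
{
	bg->space_info->total += size;
	bg->space_info->used += used;
}
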
@@ -9355,7 +9668,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                goto out;
        }
 
+       /*
+        * get the inode first so any iput calls done for the io_list
+        * aren't the final iput (no unlinks allowed now)
+        */
        inode = lookup_free_space_inode(tree_root, block_group, path);
+
+       mutex_lock(&trans->transaction->cache_write_mutex);
+       /*
+        * make sure our free space cache IO is done before removing the
+        * free space inode
+        */
+       spin_lock(&trans->transaction->dirty_bgs_lock);
+       if (!list_empty(&block_group->io_list)) {
+               list_del_init(&block_group->io_list);
+
+               WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+               btrfs_wait_cache_io(root, trans, block_group,
+                                   &block_group->io_ctl, path,
+                                   block_group->key.objectid);
+               btrfs_put_block_group(block_group);
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+       }
+
+       if (!list_empty(&block_group->dirty_list)) {
+               list_del_init(&block_group->dirty_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&trans->transaction->dirty_bgs_lock);
+       mutex_unlock(&trans->transaction->cache_write_mutex);
+
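
Note the lock dance in the drain above: btrfs_wait_cache_io() can sleep, so dirty_bgs_lock (a spinlock) is dropped before the wait and retaken afterwards to finish checking the lists. Sleeping under a spinlock would be a bug; the pattern is claim-under-lock, unlock, block, relock. A kernel-style sketch, where wait_for_io() is a hypothetical blocking wait:

#include <linux/list.h>
#include <linux/spinlock.h>

static void drain_one(struct list_head *item, spinlock_t *lock)
{
	spin_lock(lock);
	if (!list_empty(item)) {
		list_del_init(item);	/* claim the item while protected */
		spin_unlock(lock);
		wait_for_io();		/* may sleep: lock must be dropped */
		spin_lock(lock);	/* retake to inspect remaining state */
	}
	spin_unlock(lock);
}
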
        if (!IS_ERR(inode)) {
                ret = btrfs_orphan_add(trans, inode);
                if (ret) {
@@ -9448,18 +9792,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        spin_lock(&trans->transaction->dirty_bgs_lock);
-       if (!list_empty(&block_group->dirty_list)) {
-               list_del_init(&block_group->dirty_list);
-               btrfs_put_block_group(block_group);
-       }
+       WARN_ON(!list_empty(&block_group->dirty_list));
+       WARN_ON(!list_empty(&block_group->io_list));
        spin_unlock(&trans->transaction->dirty_bgs_lock);
-
        btrfs_remove_free_space_cache(block_group);
 
        spin_lock(&block_group->space_info->lock);
        list_del_init(&block_group->ro_list);
+
+       if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+               WARN_ON(block_group->space_info->total_bytes
+                       < block_group->key.offset);
+               WARN_ON(block_group->space_info->bytes_readonly
+                       < block_group->key.offset);
+               WARN_ON(block_group->space_info->disk_total
+                       < block_group->key.offset * factor);
+       }
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        block_group->space_info->disk_total -= block_group->key.offset * factor;
+
        spin_unlock(&block_group->space_info->lock);
 
        memcpy(&key, &block_group->key, sizeof(key));
@@ -9574,6 +9929,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                }
                spin_unlock(&fs_info->unused_bgs_lock);
 
+               mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+
                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
                spin_lock(&block_group->lock);
@@ -9647,8 +10004,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
                /* Reset pinned so btrfs_put_block_group doesn't complain */
+               spin_lock(&space_info->lock);
+               spin_lock(&block_group->lock);
+
+               space_info->bytes_pinned -= block_group->pinned;
+               space_info->bytes_readonly += block_group->pinned;
+               percpu_counter_add(&space_info->total_bytes_pinned,
+                                  -block_group->pinned);
                block_group->pinned = 0;
 
+               spin_unlock(&block_group->lock);
+               spin_unlock(&space_info->lock);
+
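
The counter transfer just above keeps space_info consistent when an unused block group is deleted: whatever was still pinned moves from bytes_pinned to bytes_readonly under both locks, the per-cpu total_bytes_pinned drops by the same amount, and ->pinned goes to zero so btrfs_put_block_group() has nothing to complain about. A simplified sketch of the invariant being preserved (locking and the percpu counter omitted; types reduced to the two fields involved):

#include <linux/types.h>

struct si { u64 bytes_pinned, bytes_readonly; };
struct bg { u64 pinned; };

static void unpin_into_readonly(struct si *si, struct bg *bg)
{
	u64 sum = si->bytes_pinned + si->bytes_readonly;

	si->bytes_pinned -= bg->pinned;
	si->bytes_readonly += bg->pinned;
	bg->pinned = 0;

	/* the transfer conserves bytes_pinned + bytes_readonly */
	WARN_ON(si->bytes_pinned + si->bytes_readonly != sum);
}
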
                /*
                 * btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.
@@ -9658,6 +10025,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 end_trans:
                btrfs_end_transaction(trans, root);
 next:
+               mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
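
The new delete_unused_bgs_mutex is taken at the top of each loop iteration and released on the shared next: path, so every route out of an iteration (success, end_trans, or an early skip) drops it exactly once. A compact sketch of that shape; 'struct bg' with its embedded list member, worth_deleting() and delete_one() are all hypothetical:

#include <linux/list.h>
#include <linux/mutex.h>

static void reap_unused(struct list_head *groups, struct mutex *m)
{
	struct bg *it, *tmp;

	list_for_each_entry_safe(it, tmp, groups, list) {
		mutex_lock(m);
		if (!worth_deleting(it))
			goto next;
		delete_one(it);
next:
		mutex_unlock(m);	/* one unlock for every path */
	}
}
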