Btrfs: remove transaction from send

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9c01509..4d2508b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,6 +35,7 @@
 #include "locking.h"
 #include "free-space-cache.h"
 #include "math.h"
+#include "sysfs.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -418,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work)
 again:
        mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
-       down_read(&fs_info->extent_commit_sem);
+       down_read(&fs_info->commit_root_sem);
 
 next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -441,10 +442,11 @@ next:
                        if (ret)
                                break;
 
-                       if (need_resched()) {
+                       if (need_resched() ||
+                           rwsem_is_contended(&fs_info->commit_root_sem)) {
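+                               /*
+                                * Back off: a transaction commit may be
+                                * waiting for commit_root_sem.
+                                */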
                                caching_ctl->progress = last;
                                btrfs_release_path(path);
-                               up_read(&fs_info->extent_commit_sem);
+                               up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
                                goto again;
@@ -511,7 +513,7 @@ next:
 
 err:
        btrfs_free_path(path);
-       up_read(&fs_info->extent_commit_sem);
+       up_read(&fs_info->commit_root_sem);
 
        free_excluded_extents(extent_root, block_group);
 
@@ -547,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        atomic_set(&caching_ctl->count, 1);
-       caching_ctl->work.func = caching_thread;
+       btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 
        spin_lock(&cache->lock);
        /*
@@ -631,14 +633,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                return 0;
        }
 
-       down_write(&fs_info->extent_commit_sem);
+       down_write(&fs_info->commit_root_sem);
        atomic_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
-       up_write(&fs_info->extent_commit_sem);
+       up_write(&fs_info->commit_root_sem);
 
        btrfs_get_block_group(cache);
 
-       btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
+       btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 
        return ret;
 }
@@ -855,12 +857,14 @@ again:
                        btrfs_put_delayed_ref(&head->node);
                        goto search_again;
                }
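+               /* sample extent_op and ref_mod under head->lock */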
+               spin_lock(&head->lock);
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);
 
                num_refs += head->node.ref_mod;
+               spin_unlock(&head->lock);
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
@@ -1070,11 +1074,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
        __le64 lenum;
 
        lenum = cpu_to_le64(root_objectid);
-       high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+       high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
-       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
-       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 
        return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
@@ -2285,64 +2289,62 @@ static noinline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
        struct rb_node *node;
-       struct btrfs_delayed_ref_node *ref;
-       int action = BTRFS_ADD_DELAYED_REF;
-again:
+       struct btrfs_delayed_ref_node *ref, *last = NULL;
+
        /*
         * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
         * this prevents ref count from going down to zero when
         * there still are pending delayed ref.
         */
-       node = rb_prev(&head->node.rb_node);
-       while (1) {
-               if (!node)
-                       break;
+       node = rb_first(&head->ref_root);
+       while (node) {
                ref = rb_entry(node, struct btrfs_delayed_ref_node,
                                rb_node);
-               if (ref->bytenr != head->node.bytenr)
-                       break;
-               if (ref->action == action)
+               if (ref->action == BTRFS_ADD_DELAYED_REF)
                        return ref;
-               node = rb_prev(node);
-       }
-       if (action == BTRFS_ADD_DELAYED_REF) {
-               action = BTRFS_DROP_DELAYED_REF;
-               goto again;
+               else if (last == NULL)
+                       last = ref;
+               node = rb_next(node);
        }
-       return NULL;
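+       /* no ADD refs queued; run the first DROP ref we found */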
+       return last;
 }
 
 /*
  * Returns 0 on success or if called with an already aborted transaction.
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
-static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
-                                      struct btrfs_root *root,
-                                      struct list_head *cluster)
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                                            struct btrfs_root *root,
+                                            unsigned long nr)
 {
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
        struct btrfs_fs_info *fs_info = root->fs_info;
+       ktime_t start = ktime_get();
        int ret;
-       int count = 0;
+       unsigned long count = 0;
+       unsigned long actual_count = 0;
        int must_insert_reserved = 0;
 
        delayed_refs = &trans->transaction->delayed_refs;
        while (1) {
                if (!locked_ref) {
-                       /* pick a new head ref from the cluster list */
-                       if (list_empty(cluster))
+                       if (count >= nr)
                                break;
 
-                       locked_ref = list_entry(cluster->next,
-                                    struct btrfs_delayed_ref_head, cluster);
+                       spin_lock(&delayed_refs->lock);
+                       locked_ref = btrfs_select_ref_head(trans);
+                       if (!locked_ref) {
+                               spin_unlock(&delayed_refs->lock);
+                               break;
+                       }
 
                        /* grab the lock that says we are going to process
                         * all the refs for this head */
                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
-
+                       spin_unlock(&delayed_refs->lock);
                        /*
                         * we may have dropped the spin lock to get the head
                         * mutex lock, and that might have given someone else
@@ -2363,6 +2365,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                 * finish.  If we merged anything we need to re-loop so we can
                 * get a good ref.
                 */
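+               /* take the head's lock before touching its ref rbtree */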
+               spin_lock(&locked_ref->lock);
                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
                                         locked_ref);
 
@@ -2374,17 +2377,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
                if (ref && ref->seq &&
                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
-                       /*
-                        * there are still refs with lower seq numbers in the
-                        * process of being added. Don't run this ref yet.
-                        */
-                       list_del_init(&locked_ref->cluster);
+                       spin_unlock(&locked_ref->lock);
                        btrfs_delayed_ref_unlock(locked_ref);
-                       locked_ref = NULL;
+                       spin_lock(&delayed_refs->lock);
+                       locked_ref->processing = 0;
                        delayed_refs->num_heads_ready++;
                        spin_unlock(&delayed_refs->lock);
+                       locked_ref = NULL;
                        cond_resched();
-                       spin_lock(&delayed_refs->lock);
+                       count++;
                        continue;
                }
 
@@ -2399,6 +2400,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                locked_ref->extent_op = NULL;
 
                if (!ref) {
                        /* All delayed refs have been processed, go ahead
                         * and send the head node to run_one_delayed_ref,
                         * so that any accounting fixes can happen
@@ -2411,8 +2414,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                        }
 
                        if (extent_op) {
-                               spin_unlock(&delayed_refs->lock);
-
+                               spin_unlock(&locked_ref->lock);
                                ret = run_delayed_extent_op(trans, root,
                                                            ref, extent_op);
                                btrfs_free_delayed_extent_op(extent_op);
@@ -2426,19 +2428,40 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                         */
                                        if (must_insert_reserved)
                                                locked_ref->must_insert_reserved = 1;
+                                       locked_ref->processing = 0;
                                        btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
-                                       spin_lock(&delayed_refs->lock);
                                        btrfs_delayed_ref_unlock(locked_ref);
                                        return ret;
                                }
+                               continue;
+                       }
 
-                               goto next;
+                       /*
+                        * Need to drop our head ref lock and re-acquire the
+                        * delayed ref lock and then re-check to make sure
+                        * nobody got added.
+                        */
+                       spin_unlock(&locked_ref->lock);
+                       spin_lock(&delayed_refs->lock);
+                       spin_lock(&locked_ref->lock);
+                       if (rb_first(&locked_ref->ref_root) ||
+                           locked_ref->extent_op) {
+                               spin_unlock(&locked_ref->lock);
+                               spin_unlock(&delayed_refs->lock);
+                               continue;
                        }
+                       ref->in_tree = 0;
+                       delayed_refs->num_heads--;
+                       rb_erase(&locked_ref->href_node,
+                                &delayed_refs->href_root);
+                       spin_unlock(&delayed_refs->lock);
+               } else {
+                       actual_count++;
+                       ref->in_tree = 0;
+                       rb_erase(&ref->rb_node, &locked_ref->ref_root);
                }
+               atomic_dec(&delayed_refs->num_entries);
 
-               ref->in_tree = 0;
-               rb_erase(&ref->rb_node, &delayed_refs->root);
-               delayed_refs->num_entries--;
                if (!btrfs_delayed_ref_is_head(ref)) {
                        /*
                         * when we play the delayed ref, also correct the
@@ -2455,20 +2478,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                        default:
                                WARN_ON(1);
                        }
-               } else {
-                       list_del_init(&locked_ref->cluster);
                }
-               spin_unlock(&delayed_refs->lock);
+               spin_unlock(&locked_ref->lock);
 
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
                                          must_insert_reserved);
 
                btrfs_free_delayed_extent_op(extent_op);
                if (ret) {
+                       locked_ref->processing = 0;
                        btrfs_delayed_ref_unlock(locked_ref);
                        btrfs_put_delayed_ref(ref);
                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
-                       spin_lock(&delayed_refs->lock);
                        return ret;
                }
 
@@ -2484,11 +2505,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
                btrfs_put_delayed_ref(ref);
                count++;
-next:
                cond_resched();
+       }
+
+       /*
+        * We don't want to include ref heads since we can have empty ones,
+        * and those would drastically skew the runtime down: for them we
+        * only do accounting, no actual extent tree updates.
+        */
+       if (actual_count > 0) {
+               u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+               u64 avg;
+
+               /*
+                * We weigh the current average higher than our current runtime
+                * to avoid large swings in the average.
+                */
                spin_lock(&delayed_refs->lock);
+               avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+               avg = div64_u64(avg, 4);
+               fs_info->avg_delayed_ref_runtime = avg;
+               spin_unlock(&delayed_refs->lock);
        }
-       return count;
+       return 0;
 }
 
 #ifdef SCRAMBLE_DELAYED_REFS
@@ -2570,16 +2609,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
-                     int count)
-{
-       int val = atomic_read(&delayed_refs->ref_seq);
-
-       if (val < seq || val >= seq + count)
-               return 1;
-       return 0;
-}
-
 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 {
        u64 num_bytes;
@@ -2596,7 +2625,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
        return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root)
 {
        struct btrfs_block_rsv *global_rsv;
@@ -2625,6 +2654,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u64 num_entries =
+               atomic_read(&trans->transaction->delayed_refs.num_entries);
+       u64 avg_runtime;
+
+       smp_mb();
+       avg_runtime = fs_info->avg_delayed_ref_runtime;
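+       /*
+        * Throttle if running everything queued at the observed average
+        * per-ref runtime would take a second or more.
+        */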
+       if (num_entries * avg_runtime >= NSEC_PER_SEC)
+               return 1;
+
+       return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2640,13 +2685,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 {
        struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
-       struct btrfs_delayed_ref_node *ref;
-       struct list_head cluster;
+       struct btrfs_delayed_ref_head *head;
        int ret;
-       u64 delayed_start;
        int run_all = count == (unsigned long)-1;
        int run_most = 0;
-       int loops;
 
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2658,130 +2700,40 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
 
        delayed_refs = &trans->transaction->delayed_refs;
-       INIT_LIST_HEAD(&cluster);
        if (count == 0) {
-               count = delayed_refs->num_entries * 2;
+               count = atomic_read(&delayed_refs->num_entries) * 2;
                run_most = 1;
        }
 
-       if (!run_all && !run_most) {
-               int old;
-               int seq = atomic_read(&delayed_refs->ref_seq);
-
-progress:
-               old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
-               if (old) {
-                       DEFINE_WAIT(__wait);
-                       if (delayed_refs->flushing ||
-                           !btrfs_should_throttle_delayed_refs(trans, root))
-                               return 0;
-
-                       prepare_to_wait(&delayed_refs->wait, &__wait,
-                                       TASK_UNINTERRUPTIBLE);
-
-                       old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
-                       if (old) {
-                               schedule();
-                               finish_wait(&delayed_refs->wait, &__wait);
-
-                               if (!refs_newer(delayed_refs, seq, 256))
-                                       goto progress;
-                               else
-                                       return 0;
-                       } else {
-                               finish_wait(&delayed_refs->wait, &__wait);
-                               goto again;
-                       }
-               }
-
-       } else {
-               atomic_inc(&delayed_refs->procs_running_refs);
-       }
-
 again:
-       loops = 0;
-       spin_lock(&delayed_refs->lock);
-
 #ifdef SCRAMBLE_DELAYED_REFS
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
-
-       while (1) {
-               if (!(run_all || run_most) &&
-                   !btrfs_should_throttle_delayed_refs(trans, root))
-                       break;
-
-               /*
-                * go find something we can process in the rbtree.  We start at
-                * the beginning of the tree, and then build a cluster
-                * of refs to process starting at the first one we are able to
-                * lock
-                */
-               delayed_start = delayed_refs->run_delayed_start;
-               ret = btrfs_find_ref_cluster(trans, &cluster,
-                                            delayed_refs->run_delayed_start);
-               if (ret)
-                       break;
-
-               ret = run_clustered_refs(trans, root, &cluster);
-               if (ret < 0) {
-                       btrfs_release_ref_cluster(&cluster);
-                       spin_unlock(&delayed_refs->lock);
-                       btrfs_abort_transaction(trans, root, ret);
-                       atomic_dec(&delayed_refs->procs_running_refs);
-                       wake_up(&delayed_refs->wait);
-                       return ret;
-               }
-
-               atomic_add(ret, &delayed_refs->ref_seq);
-
-               count -= min_t(unsigned long, ret, count);
-
-               if (count == 0)
-                       break;
-
-               if (delayed_start >= delayed_refs->run_delayed_start) {
-                       if (loops == 0) {
-                               /*
-                                * btrfs_find_ref_cluster looped. let's do one
-                                * more cycle. if we don't run any delayed ref
-                                * during that cycle (because we can't because
-                                * all of them are blocked), bail out.
-                                */
-                               loops = 1;
-                       } else {
-                               /*
-                                * no runnable refs left, stop trying
-                                */
-                               BUG_ON(run_all);
-                               break;
-                       }
-               }
-               if (ret) {
-                       /* refs were run, let's reset staleness detection */
-                       loops = 0;
-               }
+       ret = __btrfs_run_delayed_refs(trans, root, count);
+       if (ret < 0) {
+               btrfs_abort_transaction(trans, root, ret);
+               return ret;
        }
 
        if (run_all) {
-               if (!list_empty(&trans->new_bgs)) {
-                       spin_unlock(&delayed_refs->lock);
+               if (!list_empty(&trans->new_bgs))
                        btrfs_create_pending_block_groups(trans, root);
-                       spin_lock(&delayed_refs->lock);
-               }
 
-               node = rb_first(&delayed_refs->root);
-               if (!node)
+               spin_lock(&delayed_refs->lock);
+               node = rb_first(&delayed_refs->href_root);
+               if (!node) {
+                       spin_unlock(&delayed_refs->lock);
                        goto out;
+               }
                count = (unsigned long)-1;
 
                while (node) {
-                       ref = rb_entry(node, struct btrfs_delayed_ref_node,
-                                      rb_node);
-                       if (btrfs_delayed_ref_is_head(ref)) {
-                               struct btrfs_delayed_ref_head *head;
+                       head = rb_entry(node, struct btrfs_delayed_ref_head,
+                                       href_node);
+                       if (btrfs_delayed_ref_is_head(&head->node)) {
+                               struct btrfs_delayed_ref_node *ref;
 
-                               head = btrfs_delayed_node_to_head(ref);
+                               ref = &head->node;
                                atomic_inc(&ref->refs);
 
                                spin_unlock(&delayed_refs->lock);
@@ -2795,20 +2747,16 @@ again:
                                btrfs_put_delayed_ref(ref);
                                cond_resched();
                                goto again;
+                       } else {
+                               WARN_ON(1);
                        }
                        node = rb_next(node);
                }
                spin_unlock(&delayed_refs->lock);
-               schedule_timeout(1);
+               cond_resched();
                goto again;
        }
 out:
-       atomic_dec(&delayed_refs->procs_running_refs);
-       smp_mb();
-       if (waitqueue_active(&delayed_refs->wait))
-               wake_up(&delayed_refs->wait);
-
-       spin_unlock(&delayed_refs->lock);
        assert_qgroups_uptodate(trans);
        return 0;
 }
@@ -2850,12 +2798,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
        struct rb_node *node;
        int ret = 0;
 
-       ret = -ENOENT;
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
-       if (!head)
-               goto out;
+       if (!head) {
+               spin_unlock(&delayed_refs->lock);
+               return 0;
+       }
 
        if (!mutex_trylock(&head->mutex)) {
                atomic_inc(&head->node.refs);
@@ -2872,40 +2821,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
                btrfs_put_delayed_ref(&head->node);
                return -EAGAIN;
        }
+       spin_unlock(&delayed_refs->lock);
 
-       node = rb_prev(&head->node.rb_node);
-       if (!node)
-               goto out_unlock;
-
-       ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
-       if (ref->bytenr != bytenr)
-               goto out_unlock;
-
-       ret = 1;
-       if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
-               goto out_unlock;
+       spin_lock(&head->lock);
+       node = rb_first(&head->ref_root);
+       while (node) {
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               node = rb_next(node);
 
-       data_ref = btrfs_delayed_node_to_data_ref(ref);
+               /* If it's a shared ref we know a cross reference exists */
+               if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+                       ret = 1;
+                       break;
+               }
 
-       node = rb_prev(node);
-       if (node) {
-               int seq = ref->seq;
+               data_ref = btrfs_delayed_node_to_data_ref(ref);
 
-               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-               if (ref->bytenr == bytenr && ref->seq == seq)
-                       goto out_unlock;
+               /*
+                * If our ref doesn't match the one we're currently looking at
+                * then we have a cross reference.
+                */
+               if (data_ref->root != root->root_key.objectid ||
+                   data_ref->objectid != objectid ||
+                   data_ref->offset != offset) {
+                       ret = 1;
+                       break;
+               }
        }
-
-       if (data_ref->root != root->root_key.objectid ||
-           data_ref->objectid != objectid || data_ref->offset != offset)
-               goto out_unlock;
-
-       ret = 0;
-out_unlock:
+       spin_unlock(&head->lock);
        mutex_unlock(&head->mutex);
-out:
-       spin_unlock(&delayed_refs->lock);
        return ret;
 }
 
@@ -3402,6 +3346,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
        return readonly;
 }
 
+static const char *alloc_name(u64 flags)
+{
+       switch (flags) {
+       case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
+               return "mixed";
+       case BTRFS_BLOCK_GROUP_METADATA:
+               return "metadata";
+       case BTRFS_BLOCK_GROUP_DATA:
+               return "data";
+       case BTRFS_BLOCK_GROUP_SYSTEM:
+               return "system";
+       default:
+               WARN_ON(1);
+               return "invalid-combination";
+       }
+}
+
 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                             u64 total_bytes, u64 bytes_used,
                             struct btrfs_space_info **space_info)
@@ -3439,8 +3400,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                return ret;
        }
 
-       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
                INIT_LIST_HEAD(&found->block_groups[i]);
+               kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
+       }
        init_rwsem(&found->groups_sem);
        spin_lock_init(&found->lock);
        found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -3457,11 +3420,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->chunk_alloc = 0;
        found->flush = 0;
        init_waitqueue_head(&found->wait);
+
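+       /* export this space_info via sysfs */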
+       ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+                                   info->space_info_kobj, "%s",
+                                   alloc_name(found->flags));
+       if (ret) {
+               kfree(found);
+               return ret;
+       }
+
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = found;
-       return 0;
+
+       return ret;
 }
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
@@ -3999,7 +3972,7 @@ static int can_overcommit(struct btrfs_root *root,
 }
 
 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
-                                        unsigned long nr_pages)
+                                        unsigned long nr_pages, int nr_items)
 {
        struct super_block *sb = root->fs_info->sb;
 
@@ -4014,9 +3987,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                 * the filesystem is readonly(all dirty pages are written to
                 * the disk).
                 */
-               btrfs_start_delalloc_roots(root->fs_info, 0);
+               btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
                if (!current->journal_info)
-                       btrfs_wait_ordered_roots(root->fs_info, -1);
+                       btrfs_wait_ordered_roots(root->fs_info, nr_items);
        }
 }
 
@@ -4073,7 +4046,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               btrfs_writeback_inodes_sb_nr(root, nr_pages);
+               btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
                /*
                 * We need to wait for the async pages to actually start before
                 * we do anything.
@@ -4140,13 +4113,9 @@ static int may_commit_transaction(struct btrfs_root *root,
                goto commit;
 
        /* See if there is enough pinned space to make this reservation */
-       spin_lock(&space_info->lock);
        if (percpu_counter_compare(&space_info->total_bytes_pinned,
-                                  bytes) >= 0) {
-               spin_unlock(&space_info->lock);
+                                  bytes) >= 0)
                goto commit;
-       }
-       spin_unlock(&space_info->lock);
 
        /*
         * See if there is some space in the delayed insertion reservation for
@@ -4155,16 +4124,13 @@ static int may_commit_transaction(struct btrfs_root *root,
        if (space_info != delayed_rsv->space_info)
                return -ENOSPC;
 
-       spin_lock(&space_info->lock);
        spin_lock(&delayed_rsv->lock);
        if (percpu_counter_compare(&space_info->total_bytes_pinned,
                                   bytes - delayed_rsv->size) >= 0) {
                spin_unlock(&delayed_rsv->lock);
-               spin_unlock(&space_info->lock);
                return -ENOSPC;
        }
        spin_unlock(&delayed_rsv->lock);
-       spin_unlock(&space_info->lock);
 
 commit:
        trans = btrfs_join_transaction(root);
@@ -4209,7 +4175,7 @@ static int flush_space(struct btrfs_root *root,
                break;
        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
-               shrink_delalloc(root, num_bytes, orig_bytes,
+               shrink_delalloc(root, num_bytes * 2, orig_bytes,
                                state == FLUSH_DELALLOC_WAIT);
                break;
        case ALLOC_CHUNK:
@@ -4637,7 +4603,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
                             u64 num_bytes)
 {
        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
-       if (global_rsv->full || global_rsv == block_rsv ||
+       if (global_rsv == block_rsv ||
            block_rsv->space_info != global_rsv->space_info)
                global_rsv = NULL;
        block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
@@ -5505,7 +5471,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *cache;
        struct btrfs_space_info *space_info;
 
-       down_write(&fs_info->extent_commit_sem);
+       down_write(&fs_info->commit_root_sem);
 
        list_for_each_entry_safe(caching_ctl, next,
                                 &fs_info->caching_block_groups, list) {
@@ -5524,7 +5490,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
        else
                fs_info->pinned_extents = &fs_info->freed_extents[0];
 
-       up_write(&fs_info->extent_commit_sem);
+       up_write(&fs_info->commit_root_sem);
 
        list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
                percpu_counter_set(&space_info->total_bytes_pinned, 0);
@@ -5916,24 +5882,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 {
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
-       struct btrfs_delayed_ref_node *ref;
-       struct rb_node *node;
        int ret = 0;
 
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (!head)
-               goto out;
+               goto out_delayed_unlock;
 
-       node = rb_prev(&head->node.rb_node);
-       if (!node)
-               goto out;
-
-       ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
-       /* there are still entries for this ref, we can't drop it */
-       if (ref->bytenr == bytenr)
+       spin_lock(&head->lock);
+       if (rb_first(&head->ref_root))
                goto out;
 
        if (head->extent_op) {
@@ -5955,19 +5913,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
         * ahead and process it.
         */
        head->node.in_tree = 0;
-       rb_erase(&head->node.rb_node, &delayed_refs->root);
+       rb_erase(&head->href_node, &delayed_refs->href_root);
 
-       delayed_refs->num_entries--;
+       atomic_dec(&delayed_refs->num_entries);
 
        /*
         * we don't take a ref on the node because we're removing it from the
         * tree, so we just steal the ref the tree was holding.
         */
        delayed_refs->num_heads--;
-       if (list_empty(&head->cluster))
+       if (head->processing == 0)
                delayed_refs->num_heads_ready--;
-
-       list_del_init(&head->cluster);
+       head->processing = 0;
+       spin_unlock(&head->lock);
        spin_unlock(&delayed_refs->lock);
 
        BUG_ON(head->extent_op);
@@ -5978,6 +5936,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        btrfs_put_delayed_ref(&head->node);
        return ret;
 out:
+       spin_unlock(&head->lock);
+
+out_delayed_unlock:
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -6145,11 +6106,29 @@ int __get_raid_index(u64 flags)
        return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+int get_block_group_index(struct btrfs_block_group_cache *cache)
 {
        return __get_raid_index(cache->flags);
 }
 
+static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
+       [BTRFS_RAID_RAID10]     = "raid10",
+       [BTRFS_RAID_RAID1]      = "raid1",
+       [BTRFS_RAID_DUP]        = "dup",
+       [BTRFS_RAID_RAID0]      = "raid0",
+       [BTRFS_RAID_SINGLE]     = "single",
+       [BTRFS_RAID_RAID5]      = "raid5",
+       [BTRFS_RAID_RAID6]      = "raid6",
+};
+
+static const char *get_raid_name(enum btrfs_raid_types type)
+{
+       if (type >= BTRFS_NR_RAID_TYPES)
+               return NULL;
+
+       return btrfs_raid_type_names[type];
+}
+
 enum btrfs_loop_type {
        LOOP_CACHING_NOWAIT = 0,
        LOOP_CACHING_WAIT = 1,
@@ -6177,7 +6156,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        struct btrfs_root *root = orig_root->fs_info->extent_root;
        struct btrfs_free_cluster *last_ptr = NULL;
        struct btrfs_block_group_cache *block_group = NULL;
-       struct btrfs_block_group_cache *used_block_group;
        u64 search_start = 0;
        u64 max_extent_size = 0;
        int empty_cluster = 2 * 1024 * 1024;
@@ -6186,7 +6164,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        int index = __get_raid_index(flags);
        int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
-       bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
@@ -6239,7 +6216,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
-               used_block_group = block_group;
                /*
                 * we don't want to use the block group if it doesn't match our
                 * allocation bits, or if its not cached.
@@ -6276,7 +6252,6 @@ search:
                u64 offset;
                int cached;
 
-               used_block_group = block_group;
                btrfs_get_block_group(block_group);
                search_start = block_group->key.objectid;
 
@@ -6304,7 +6279,6 @@ search:
 have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
-                       found_uncached_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
@@ -6320,6 +6294,7 @@ have_block_group:
                 * lets look there
                 */
                if (last_ptr) {
+                       struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
                         * the refill lock keeps out other
@@ -6330,10 +6305,8 @@ have_block_group:
                        if (used_block_group != block_group &&
                            (!used_block_group ||
                             used_block_group->ro ||
-                            !block_group_bits(used_block_group, flags))) {
-                               used_block_group = block_group;
+                            !block_group_bits(used_block_group, flags)))
                                goto refill_cluster;
-                       }
 
                        if (used_block_group != block_group)
                                btrfs_get_block_group(used_block_group);
@@ -6347,17 +6320,19 @@ have_block_group:
                                /* we have a block, we're done */
                                spin_unlock(&last_ptr->refill_lock);
                                trace_btrfs_reserve_extent_cluster(root,
-                                       block_group, search_start, num_bytes);
+                                               used_block_group,
+                                               search_start, num_bytes);
+                               if (used_block_group != block_group) {
+                                       btrfs_put_block_group(block_group);
+                                       block_group = used_block_group;
+                               }
                                goto checks;
                        }
 
                        WARN_ON(last_ptr->block_group != used_block_group);
-                       if (used_block_group != block_group) {
+                       if (used_block_group != block_group)
                                btrfs_put_block_group(used_block_group);
-                               used_block_group = block_group;
-                       }
 refill_cluster:
-                       BUG_ON(used_block_group != block_group);
                        /* If we are on LOOP_NO_EMPTY_SIZE, we can't
                         * set up a new cluster, so let's just skip it
                         * and let the allocator find whatever block
@@ -6476,25 +6451,25 @@ unclustered_alloc:
                        goto loop;
                }
 checks:
-               search_start = stripe_align(root, used_block_group,
+               search_start = stripe_align(root, block_group,
                                            offset, num_bytes);
 
                /* move on to the next group */
                if (search_start + num_bytes >
-                   used_block_group->key.objectid + used_block_group->key.offset) {
-                       btrfs_add_free_space(used_block_group, offset, num_bytes);
+                   block_group->key.objectid + block_group->key.offset) {
+                       btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
                }
 
                if (offset < search_start)
-                       btrfs_add_free_space(used_block_group, offset,
+                       btrfs_add_free_space(block_group, offset,
                                             search_start - offset);
                BUG_ON(offset > search_start);
 
-               ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
+               ret = btrfs_update_reserved_bytes(block_group, num_bytes,
                                                  alloc_type);
                if (ret == -EAGAIN) {
-                       btrfs_add_free_space(used_block_group, offset, num_bytes);
+                       btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
                }
 
@@ -6504,16 +6479,12 @@ checks:
 
                trace_btrfs_reserve_extent(orig_root, block_group,
                                           search_start, num_bytes);
-               if (used_block_group != block_group)
-                       btrfs_put_block_group(used_block_group);
                btrfs_put_block_group(block_group);
                break;
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
                BUG_ON(index != get_block_group_index(block_group));
-               if (used_block_group != block_group)
-                       btrfs_put_block_group(used_block_group);
                btrfs_put_block_group(block_group);
        }
        up_read(&space_info->groups_sem);
@@ -6584,12 +6555,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
        int index = 0;
 
        spin_lock(&info->lock);
-       printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
+       printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
               info->flags,
               info->total_bytes - info->bytes_used - info->bytes_pinned -
               info->bytes_reserved - info->bytes_readonly,
               (info->full) ? "" : "not ");
-       printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+       printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
               "reserved=%llu, may_use=%llu, readonly=%llu\n",
               info->total_bytes, info->bytes_used, info->bytes_pinned,
               info->bytes_reserved, info->bytes_may_use,
@@ -6603,7 +6574,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
-               printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
+               printk(KERN_INFO "BTRFS: "
+                          "block group %llu has %llu bytes, "
+                          "%llu used %llu pinned %llu reserved %s\n",
                       cache->key.objectid, cache->key.offset,
                       btrfs_block_group_used(&cache->item), cache->pinned,
                       cache->reserved, cache->ro ? "[readonly]" : "");
@@ -6966,7 +6939,7 @@ again:
                                /*DEFAULT_RATELIMIT_BURST*/ 1);
                if (__ratelimit(&_rs))
                        WARN(1, KERN_DEBUG
-                               "btrfs: block rsv returned %d\n", ret);
+                               "BTRFS: block rsv returned %d\n", ret);
        }
 try_reserve:
        ret = reserve_metadata_bytes(root, block_rsv, blocksize,
@@ -7714,7 +7687,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 
                        btrfs_end_transaction_throttle(trans, tree_root);
                        if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
-                               pr_debug("btrfs: drop snapshot early exit\n");
+                               pr_debug("BTRFS: drop snapshot early exit\n");
                                err = -EAGAIN;
                                goto out_free;
                        }
@@ -7779,7 +7752,7 @@ out:
         */
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
-       if (err)
+       if (err && err != -EAGAIN)
                btrfs_std_error(root->fs_info, err);
        return err;
 }
@@ -8283,14 +8256,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        struct btrfs_caching_control *caching_ctl;
        struct rb_node *n;
 
-       down_write(&info->extent_commit_sem);
+       down_write(&info->commit_root_sem);
        while (!list_empty(&info->caching_block_groups)) {
                caching_ctl = list_entry(info->caching_block_groups.next,
                                         struct btrfs_caching_control, list);
                list_del(&caching_ctl->list);
                put_caching_control(caching_ctl);
        }
-       up_write(&info->extent_commit_sem);
+       up_write(&info->commit_root_sem);
 
        spin_lock(&info->block_group_cache_lock);
        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8333,6 +8306,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        release_global_block_rsv(info);
 
        while (!list_empty(&info->space_info)) {
+               int i;
+
                space_info = list_entry(info->space_info.next,
                                        struct btrfs_space_info,
                                        list);
@@ -8343,9 +8318,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                                dump_space_info(space_info, 0, 0);
                        }
                }
-               percpu_counter_destroy(&space_info->total_bytes_pinned);
                list_del(&space_info->list);
-               kfree(space_info);
+               for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+                       struct kobject *kobj;
+                       kobj = &space_info->block_group_kobjs[i];
+                       if (kobj->parent) {
+                               kobject_del(kobj);
+                               kobject_put(kobj);
+                       }
+               }
+               kobject_del(&space_info->kobj);
+               kobject_put(&space_info->kobj);
        }
        return 0;
 }
@@ -8356,10 +8339,57 @@ static void __link_block_group(struct btrfs_space_info *space_info,
        int index = get_block_group_index(cache);
 
        down_write(&space_info->groups_sem);
+       if (list_empty(&space_info->block_groups[index])) {
+               struct kobject *kobj = &space_info->block_group_kobjs[index];
+               int ret;
+
+               kobject_get(&space_info->kobj); /* put in release */
+               ret = kobject_add(kobj, &space_info->kobj, "%s",
+                                 get_raid_name(index));
+               if (ret) {
+                       pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
+                       kobject_put(&space_info->kobj);
+               }
+       }
        list_add_tail(&cache->list, &space_info->block_groups[index]);
        up_write(&space_info->groups_sem);
 }
 
+static struct btrfs_block_group_cache *
+btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
+{
+       struct btrfs_block_group_cache *cache;
+
+       cache = kzalloc(sizeof(*cache), GFP_NOFS);
+       if (!cache)
+               return NULL;
+
+       cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+                                       GFP_NOFS);
+       if (!cache->free_space_ctl) {
+               kfree(cache);
+               return NULL;
+       }
+
+       cache->key.objectid = start;
+       cache->key.offset = size;
+       cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+
+       cache->sectorsize = root->sectorsize;
+       cache->fs_info = root->fs_info;
+       cache->full_stripe_len = btrfs_full_stripe_len(root,
+                                              &root->fs_info->mapping_tree,
+                                              start);
+       atomic_set(&cache->count, 1);
+       spin_lock_init(&cache->lock);
+       INIT_LIST_HEAD(&cache->list);
+       INIT_LIST_HEAD(&cache->cluster_list);
+       INIT_LIST_HEAD(&cache->new_bg_list);
+       btrfs_init_free_space_ctl(cache);
+
+       return cache;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
        struct btrfs_path *path;
@@ -8395,26 +8425,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                        break;
                if (ret != 0)
                        goto error;
+
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-               cache = kzalloc(sizeof(*cache), GFP_NOFS);
+
+               cache = btrfs_create_block_group_cache(root, found_key.objectid,
+                                                      found_key.offset);
                if (!cache) {
                        ret = -ENOMEM;
                        goto error;
                }
-               cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-                                               GFP_NOFS);
-               if (!cache->free_space_ctl) {
-                       kfree(cache);
-                       ret = -ENOMEM;
-                       goto error;
-               }
-
-               atomic_set(&cache->count, 1);
-               spin_lock_init(&cache->lock);
-               cache->fs_info = info;
-               INIT_LIST_HEAD(&cache->list);
-               INIT_LIST_HEAD(&cache->cluster_list);
 
                if (need_clear) {
                        /*
@@ -8435,16 +8455,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
                                   sizeof(cache->item));
-               memcpy(&cache->key, &found_key, sizeof(found_key));
+               cache->flags = btrfs_block_group_flags(&cache->item);
 
                key.objectid = found_key.objectid + found_key.offset;
                btrfs_release_path(path);
-               cache->flags = btrfs_block_group_flags(&cache->item);
-               cache->sectorsize = root->sectorsize;
-               cache->full_stripe_len = btrfs_full_stripe_len(root,
-                                              &root->fs_info->mapping_tree,
-                                              found_key.objectid);
-               btrfs_init_free_space_ctl(cache);
 
                /*
                 * We need to exclude the super stripes now so that the space
@@ -8458,8 +8472,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                         * case.
                         */
                        free_excluded_extents(root, cache);
-                       kfree(cache->free_space_ctl);
-                       kfree(cache);
+                       btrfs_put_block_group(cache);
                        goto error;
                }
 
@@ -8590,38 +8603,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        root->fs_info->last_trans_log_full_commit = trans->transid;
 
-       cache = kzalloc(sizeof(*cache), GFP_NOFS);
+       cache = btrfs_create_block_group_cache(root, chunk_offset, size);
        if (!cache)
                return -ENOMEM;
-       cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-                                       GFP_NOFS);
-       if (!cache->free_space_ctl) {
-               kfree(cache);
-               return -ENOMEM;
-       }
-
-       cache->key.objectid = chunk_offset;
-       cache->key.offset = size;
-       cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-       cache->sectorsize = root->sectorsize;
-       cache->fs_info = root->fs_info;
-       cache->full_stripe_len = btrfs_full_stripe_len(root,
-                                              &root->fs_info->mapping_tree,
-                                              chunk_offset);
-
-       atomic_set(&cache->count, 1);
-       spin_lock_init(&cache->lock);
-       INIT_LIST_HEAD(&cache->list);
-       INIT_LIST_HEAD(&cache->cluster_list);
-       INIT_LIST_HEAD(&cache->new_bg_list);
-
-       btrfs_init_free_space_ctl(cache);
 
        btrfs_set_block_group_used(&cache->item, bytes_used);
        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
-       cache->flags = type;
        btrfs_set_block_group_flags(&cache->item, type);
 
+       cache->flags = type;
        cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
        ret = exclude_super_stripes(root, cache);
@@ -8631,8 +8621,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                 * case.
                 */
                free_excluded_extents(root, cache);
-               kfree(cache->free_space_ctl);
-               kfree(cache);
+               btrfs_put_block_group(cache);
                return ret;
        }
 
@@ -8796,8 +8785,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
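+       /*
+        * If this was the last block group of this raid type, drop the
+        * matching sysfs entry as well.
+        */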
-       if (list_empty(&block_group->space_info->block_groups[index]))
+       if (list_empty(&block_group->space_info->block_groups[index])) {
+               kobject_del(&block_group->space_info->block_group_kobjs[index]);
+               kobject_put(&block_group->space_info->block_group_kobjs[index]);
                clear_avail_alloc_bits(root->fs_info, block_group->flags);
+       }
        up_write(&block_group->space_info->groups_sem);
 
        if (block_group->cached == BTRFS_CACHE_STARTED)
@@ -8940,3 +8932,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
        range->len = trimmed;
        return ret;
 }
+
+/*
+ * btrfs_{start,end}_nocow_write() are similar to mnt_{want,drop}_write():
+ * they are used to prevent tasks from writing data into the page cache via
+ * nocow before the subvolume is snapshotted, and to make sure the data is
+ * flushed to disk after the snapshot is created.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+       percpu_counter_dec(&root->subv_writers->counter);
+       /*
+        * Make sure counter is updated before we wake up
+        * waiters.
+        */
+       smp_mb();
+       if (waitqueue_active(&root->subv_writers->wait))
+               wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+       if (unlikely(atomic_read(&root->will_be_snapshoted)))
+               return 0;
+
+       percpu_counter_inc(&root->subv_writers->counter);
+       /*
+        * Make sure counter is updated before we check for snapshot creation.
+        */
+       smp_mb();
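+       /*
+        * Re-check: snapshot creation may have begun between the check above
+        * and the counter increment; if so, undo the increment and return 0
+        * so the caller falls back to cow.
+        */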
+       if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+               btrfs_end_nocow_write(root);
+               return 0;
+       }
+       return 1;
+}