btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans
[cascardo/linux.git] / fs / btrfs / extent-tree.c
index 9f96042..f50c7c2 100644 (file)
@@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins,
-                                    int no_quota);
+                                    int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
@@ -332,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
                kfree(ctl);
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+                               struct btrfs_block_group_cache *block_group)
+{
+       u64 start = block_group->key.objectid;
+       u64 len = block_group->key.offset;
+       u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+               root->nodesize : root->sectorsize;
+       u64 step = chunk << 1;
+
+       while (len > chunk) {
+               btrfs_remove_free_space(block_group, start, chunk);
+               start += step;
+               if (len < step)
+                       len = 0;
+               else
+                       len -= step;
+       }
+}
+#endif
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
@@ -388,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 last = 0;
        u32 nritems;
        int ret = -ENOMEM;
+       bool wakeup = true;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
@@ -400,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work)
 
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       /*
+        * If we're fragmenting we don't want to make anybody think we can
+        * allocate from this block group until we've had a chance to fragment
+        * the free space.
+        */
+       if (btrfs_should_fragment_free_space(extent_root, block_group))
+               wakeup = false;
+#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@ -441,7 +471,8 @@ next:
 
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
-                               caching_ctl->progress = last;
+                               if (wakeup)
+                                       caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
@@ -464,7 +495,8 @@ next:
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
 
-                       caching_ctl->progress = last;
+                       if (wakeup)
+                               caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
@@ -491,7 +523,8 @@ next:
 
                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
-                               wake_up(&caching_ctl->wait);
+                               if (wakeup)
+                                       wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
@@ -501,13 +534,27 @@ next:
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
-       caching_ctl->progress = (u64)-1;
-
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+               u64 bytes_used;
+
+               spin_lock(&block_group->space_info->lock);
+               spin_lock(&block_group->lock);
+               bytes_used = block_group->key.offset -
+                       btrfs_block_group_used(&block_group->item);
+               block_group->space_info->bytes_used += bytes_used >> 1;
+               spin_unlock(&block_group->lock);
+               spin_unlock(&block_group->space_info->lock);
+               fragment_free_space(extent_root, block_group);
+       }
+#endif
+
+       caching_ctl->progress = (u64)-1;
 err:
        btrfs_free_path(path);
        up_read(&fs_info->commit_root_sem);
@@ -607,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        }
                }
                spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+               if (ret == 1 &&
+                   btrfs_should_fragment_free_space(fs_info->extent_root,
+                                                    cache)) {
+                       u64 bytes_used;
+
+                       spin_lock(&cache->space_info->lock);
+                       spin_lock(&cache->lock);
+                       bytes_used = cache->key.offset -
+                               btrfs_block_group_used(&cache->item);
+                       cache->space_info->bytes_used += bytes_used >> 1;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
+                       fragment_free_space(fs_info->extent_root, cache);
+               }
+#endif
                mutex_unlock(&caching_ctl->mutex);
 
                wake_up(&caching_ctl->wait);
@@ -2009,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset,
-                        int no_quota)
+                        u64 root_objectid, u64 owner, u64 offset)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2022,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+                                       BTRFS_ADD_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
-                                       num_bytes,
-                                       parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+                                       num_bytes, parent, root_objectid,
+                                       owner, offset, 0,
+                                       BTRFS_ADD_DELAYED_REF, NULL);
        }
        return ret;
 }
@@ -2048,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        u64 num_bytes = node->num_bytes;
        u64 refs;
        int ret;
-       int no_quota = node->no_quota;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
-       if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
-               no_quota = 1;
-
        path->reada = 1;
        path->leave_spinning = 1;
        /* this will setup the path even if it fails to insert the back ref */
@@ -2291,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                parent, ref_root,
                                                extent_op->flags_to_set,
                                                &extent_op->key,
-                                               ref->level, &ins,
-                                               node->no_quota);
+                                               ref->level, &ins);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, root, node,
                                             parent, ref_root,
@@ -2345,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                                      node->num_bytes);
                        }
                }
+
+               /* Also free its reserved qgroup space */
+               btrfs_qgroup_free_delayed_ref(root->fs_info,
+                                             head->qgroup_ref_root,
+                                             head->qgroup_reserved);
                return ret;
        }
 
@@ -2433,7 +2495,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        }
                }
 
+               /*
+                * We need to try and merge add/drops of the same ref since we
+                * can run into issues with relocate dropping the implicit ref
+                * and then it being added back again before the drop can
+                * finish.  If we merged anything we need to re-loop so we can
+                * get a good ref.
+                * Or we can get node references of the same type that weren't
+                * merged when created due to bumps in the tree mod seq, and
+                * we need to merge them to prevent adding an inline extent
+                * backref before dropping it (triggering a BUG_ON at
+                * insert_inline_extent_backref()).
+                */
                spin_lock(&locked_ref->lock);
+               btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+                                        locked_ref);
 
                /*
                 * locked_ref is the head node, so we have to go one
@@ -2828,6 +2904,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_head *head;
        int ret;
        int run_all = count == (unsigned long)-1;
+       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2844,6 +2921,7 @@ again:
 #ifdef SCRAMBLE_DELAYED_REFS
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
+       trans->can_flush_pending_bgs = false;
        ret = __btrfs_run_delayed_refs(trans, root, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, root, ret);
@@ -2893,6 +2971,7 @@ again:
        }
 out:
        assert_qgroups_uptodate(trans);
+       trans->can_flush_pending_bgs = can_flush_pending_bgs;
        return 0;
 }
 
@@ -3106,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64, int);
+                           u64, u64, u64, u64, u64, u64);
 
 
        if (btrfs_test_is_dummy_root(root))
@@ -3147,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset, 1);
+                                          key.offset);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = root->nodesize;
                        ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0,
-                                          1);
+                                          parent, ref_root, level - 1, 0);
                        if (ret)
                                goto fail;
                }
@@ -3335,6 +3413,15 @@ again:
        }
        spin_unlock(&block_group->lock);
 
+       /*
+        * We hit an ENOSPC when setting up the cache in this transaction, just
+        * skip doing the setup, we've already cleared the cache so we're safe.
+        */
+       if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+               ret = -ENOSPC;
+               goto out_put;
+       }
+
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
@@ -3348,16 +3435,26 @@ again:
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
 
-       ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+       ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
 
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       /*
+        * Our cache requires contiguous chunks so that we don't modify a bunch
+        * of metadata or split extents when writing the cache out, which means
+        * we can enospc if we are heavily fragmented in addition to just normal
+        * out of space conditions.  So if we hit this just skip setting up any
+        * other block groups for this transaction, maybe we'll unpin enough
+        * space the next time around.
+        */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
-       btrfs_free_reserved_data_space(inode, num_pages);
+       else if (ret == -ENOSPC)
+               set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+       btrfs_free_reserved_data_space(inode, 0, num_pages);
 
 out_put:
        iput(inode);
@@ -3743,6 +3840,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
+       found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@ -3819,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
        u64 target;
-       u64 tmp;
+       u64 raid_type;
+       u64 allowed = 0;
 
        /*
         * see if restripe for this chunk_type is in progress, if so
@@ -3837,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        spin_unlock(&root->fs_info->balance_lock);
 
        /* First, mask out the RAID levels which aren't possible */
-       if (num_devices == 1)
-               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
-                          BTRFS_BLOCK_GROUP_RAID5);
-       if (num_devices < 3)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID6;
-       if (num_devices < 4)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
-       tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
-                      BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
-                      BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
-       flags &= ~tmp;
-
-       if (tmp & BTRFS_BLOCK_GROUP_RAID6)
-               tmp = BTRFS_BLOCK_GROUP_RAID6;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
-               tmp = BTRFS_BLOCK_GROUP_RAID5;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
-               tmp = BTRFS_BLOCK_GROUP_RAID10;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
-               tmp = BTRFS_BLOCK_GROUP_RAID1;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
-               tmp = BTRFS_BLOCK_GROUP_RAID0;
-
-       return extended_to_chunk(flags | tmp);
+       for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+               if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+                       allowed |= btrfs_raid_group[raid_type];
+       }
+       allowed &= flags;
+
+       if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+               allowed = BTRFS_BLOCK_GROUP_RAID6;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+               allowed = BTRFS_BLOCK_GROUP_RAID5;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+               allowed = BTRFS_BLOCK_GROUP_RAID10;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+               allowed = BTRFS_BLOCK_GROUP_RAID1;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+               allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+       flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       return extended_to_chunk(flags | allowed);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3900,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        return ret;
 }
 
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4003,7 +4093,8 @@ commit_trans:
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
-                           trans->transaction->have_free_bgs ||
+                           test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+                                    &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
@@ -4025,38 +4116,86 @@ commit_trans:
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
-       ret = btrfs_qgroup_reserve(root, write_bytes);
-       if (ret)
-               goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
-out:
        spin_unlock(&data_sinfo->lock);
 
        return ret;
 }
 
 /*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with ability for precise data reservation.
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       /* align the range */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
+
+       ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * Use new btrfs_qgroup_reserve_data to reserve precise data space.
+        *
+        * TODO: Find a good method to avoid reserving data space for a NOCOW
+        * range without impacting performance in the quota-disabled case.
+        */
+       ret = btrfs_qgroup_reserve_data(inode, start, len);
+       return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will *NOT* use the accurate qgroup reserved space API; it is only
+ * for cases where we can't sleep and are sure it won't affect qgroup reserved
+ * space, like clear_bit_hook().
  */
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+                                           u64 len)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
 
-       /* make sure bytes are sectorsize aligned */
-       bytes = ALIGN(bytes, root->sectorsize);
+       /* Make sure the range is aligned to sectorsize */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
 
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
-       WARN_ON(data_sinfo->bytes_may_use < bytes);
-       data_sinfo->bytes_may_use -= bytes;
+       if (WARN_ON(data_sinfo->bytes_may_use < len))
+               data_sinfo->bytes_may_use = 0;
+       else
+               data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
-                                     data_sinfo->flags, bytes, 0);
+                                     data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will handle the per-inode data rsv map for the accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+       btrfs_free_reserved_data_space_noquota(inode, start, len);
+       btrfs_qgroup_free_data(inode, start, len);
+}
+
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
        struct list_head *head = &info->space_info;
@@ -4306,7 +4445,8 @@ out:
         * the block groups that were made dirty during the lifetime of the
         * transaction.
         */
-       if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+       if (trans->can_flush_pending_bgs &&
+           trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
                btrfs_create_pending_block_groups(trans, trans->root);
                btrfs_trans_release_chunk_metadata(trans);
        }
@@ -4887,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
        struct btrfs_block_rsv *block_rsv = NULL;
 
-       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->csum_root && trans->adding_csums)
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->uuid_root)
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+           (root == root->fs_info->csum_root && trans->adding_csums) ||
+            (root == root->fs_info->uuid_root))
                block_rsv = trans->block_rsv;
 
        if (!block_rsv)
@@ -5336,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
-               ret = btrfs_qgroup_reserve(root, num_bytes);
+               ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
@@ -5354,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
 
-       if (ret) {
-               if (*qgroup_reserved)
-                       btrfs_qgroup_free(root, *qgroup_reserved);
-       }
+       if (ret && *qgroup_reserved)
+               btrfs_qgroup_free_meta(root, *qgroup_reserved);
 
        return ret;
 }
@@ -5518,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        spin_unlock(&BTRFS_I(inode)->lock);
 
        if (root->fs_info->quota_enabled) {
-               ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+               ret = btrfs_qgroup_reserve_meta(root,
+                               nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
 
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
-               if (root->fs_info->quota_enabled)
-                       btrfs_qgroup_free(root, nr_extents * root->nodesize);
+               btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
 
@@ -5649,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
  * @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start of the range we are writing to
+ * @len: length of the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
  *
  * This will do the following things
  *
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in data space info for num bytes
+ *   and reserve the precise corresponding qgroup space
+ *   (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
  *   extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ *   also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
  * o add it to the fs_info's delalloc inodes list.
+ *   (Above 3 all done in delalloc_reserve_metadata)
  *
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error (-ENOSPC or -EDQUOT)
  */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
 {
        int ret;
 
-       ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
-       if (ret)
-               return ret;
-
-       ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
-       if (ret) {
-               btrfs_free_reserved_data_space(inode, num_bytes);
+       ret = btrfs_check_data_free_space(inode, start, len);
+       if (ret < 0)
                return ret;
-       }
-
-       return 0;
+       ret = btrfs_delalloc_reserve_metadata(inode, len);
+       if (ret < 0)
+               btrfs_free_reserved_data_space(inode, start, len);
+       return ret;
 }
 
 /**
  * btrfs_delalloc_release_space - release data and metadata space for delalloc
  * @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
  *
  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
  * called in the case that we don't need the metadata AND data reservations
@@ -5692,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
  * This function will release the metadata space that was not used and will
  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
  * list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
  */
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
 {
-       btrfs_delalloc_release_metadata(inode, num_bytes);
-       btrfs_free_reserved_data_space(inode, num_bytes);
+       btrfs_delalloc_release_metadata(inode, len);
+       btrfs_free_reserved_data_space(inode, start, len);
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
@@ -6061,6 +6203,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
        update_global_block_rsv(fs_info);
 }
 
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+                  u64 *empty_cluster)
+{
+       struct btrfs_free_cluster *ret = NULL;
+       bool ssd = btrfs_test_opt(root, SSD);
+
+       *empty_cluster = 0;
+       if (btrfs_mixed_space_info(space_info))
+               return ret;
+
+       if (ssd)
+               *empty_cluster = 2 * 1024 * 1024;
+       if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+               ret = &root->fs_info->meta_alloc_cluster;
+               if (!ssd)
+                       *empty_cluster = 64 * 1024;
+       } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+               ret = &root->fs_info->data_alloc_cluster;
+       }
+
+       return ret;
+}
+
 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                              const bool return_free_space)
 {
@@ -6068,7 +6238,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       struct btrfs_free_cluster *cluster = NULL;
        u64 len;
+       u64 total_unpinned = 0;
+       u64 empty_cluster = 0;
        bool readonly;
 
        while (start <= end) {
@@ -6077,8 +6250,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
+                       total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache); /* Logic error */
+
+                       cluster = fetch_cluster_info(root,
+                                                    cache->space_info,
+                                                    &empty_cluster);
+                       empty_cluster <<= 1;
                }
 
                len = cache->key.objectid + cache->key.offset - start;
@@ -6091,12 +6270,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                }
 
                start += len;
+               total_unpinned += len;
                space_info = cache->space_info;
 
+               /*
+                * If this space cluster has been marked as fragmented and we've
+                * unpinned enough in this block group to potentially allow a
+                * cluster to be created inside of it go ahead and clear the
+                * fragmented check.
+                */
+               if (cluster && cluster->fragmented &&
+                   total_unpinned > empty_cluster) {
+                       spin_lock(&cluster->lock);
+                       cluster->fragmented = 0;
+                       spin_unlock(&cluster->lock);
+               }
+
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
+               space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@ -6229,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        int extent_slot = 0;
        int found_extent = 0;
        int num_to_del = 1;
-       int no_quota = node->no_quota;
        u32 item_size;
        u64 refs;
        u64 bytenr = node->bytenr;
@@ -6238,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
 
-       if (!info->quota_enabled || !is_fstree(root_objectid))
-               no_quota = 1;
-
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -6566,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                                        buf->start, buf->len,
                                        parent, root->root_key.objectid,
                                        btrfs_header_level(buf),
-                                       BTRFS_DROP_DELAYED_REF, NULL, 0);
+                                       BTRFS_DROP_DELAYED_REF, NULL);
                BUG_ON(ret); /* -ENOMEM */
        }
 
@@ -6614,7 +6804,7 @@ out:
 /* Can return -ENOMEM */
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-                     u64 owner, u64 offset, int no_quota)
+                     u64 owner, u64 offset)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6637,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+                                       BTRFS_DROP_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
                                                num_bytes,
                                                parent, root_objectid, owner,
-                                               offset, BTRFS_DROP_DELAYED_REF,
-                                               NULL, no_quota);
+                                               offset, 0,
+                                               BTRFS_DROP_DELAYED_REF, NULL);
        }
        return ret;
 }
@@ -6829,7 +7019,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
-       int empty_cluster = 2 * 1024 * 1024;
+       u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
@@ -6839,6 +7029,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
+       bool full_search = false;
 
        WARN_ON(num_bytes < root->sectorsize);
        ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6854,36 +7045,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        }
 
        /*
-        * If the space info is for both data and metadata it means we have a
-        * small filesystem and we can't use the clustering stuff.
+        * If our free space is heavily fragmented we may not be able to make
+        * big contiguous allocations, so instead of doing the expensive search
+        * for free space, simply return ENOSPC with our max_extent_size so we
+        * can go ahead and search for a more manageable chunk.
+        *
+        * If our max_extent_size is large enough for our allocation simply
+        * disable clustering since we will likely not be able to find enough
+        * space to create a cluster and induce latency trying.
         */
-       if (btrfs_mixed_space_info(space_info))
-               use_cluster = false;
-
-       if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
-               last_ptr = &root->fs_info->meta_alloc_cluster;
-               if (!btrfs_test_opt(root, SSD))
-                       empty_cluster = 64 * 1024;
-       }
-
-       if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
-           btrfs_test_opt(root, SSD)) {
-               last_ptr = &root->fs_info->data_alloc_cluster;
+       if (unlikely(space_info->max_extent_size)) {
+               spin_lock(&space_info->lock);
+               if (space_info->max_extent_size &&
+                   num_bytes > space_info->max_extent_size) {
+                       ins->offset = space_info->max_extent_size;
+                       spin_unlock(&space_info->lock);
+                       return -ENOSPC;
+               } else if (space_info->max_extent_size) {
+                       use_cluster = false;
+               }
+               spin_unlock(&space_info->lock);
        }
 
+       last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
                        hint_byte = last_ptr->window_start;
+               if (last_ptr->fragmented) {
+                       /*
+                        * We still set window_start so we can keep track of the
+                        * last place we found an allocation to try and save
+                        * some time.
+                        */
+                       hint_byte = last_ptr->window_start;
+                       use_cluster = false;
+               }
                spin_unlock(&last_ptr->lock);
        }
 
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
-
-       if (!last_ptr)
-               empty_cluster = 0;
-
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
@@ -6918,6 +7120,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        }
 search:
        have_caching_bg = false;
+       if (index == 0 || index == __get_raid_index(flags))
+               full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
@@ -6951,6 +7155,7 @@ search:
 have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
+                       have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
@@ -6965,7 +7170,7 @@ have_block_group:
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
                 */
-               if (last_ptr) {
+               if (last_ptr && use_cluster) {
                        struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
@@ -7091,6 +7296,16 @@ refill_cluster:
                }
 
 unclustered_alloc:
+               /*
+                * We are doing an unclustered alloc, set the fragmented flag so
+                * we don't bother trying to setup a cluster again until we get
+                * more space.
+                */
+               if (unlikely(last_ptr)) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->fragmented = 1;
+                       spin_unlock(&last_ptr->lock);
+               }
                spin_lock(&block_group->free_space_ctl->tree_lock);
                if (cached &&
                    block_group->free_space_ctl->free_space <
@@ -7123,8 +7338,6 @@ unclustered_alloc:
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
-                       if (!cached)
-                               have_caching_bg = true;
                        goto loop;
                }
 checks:
@@ -7181,7 +7394,20 @@ loop:
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
-               loop++;
+               if (loop == LOOP_CACHING_NOWAIT) {
+                       /*
+                        * We want to skip the LOOP_CACHING_WAIT step if we
+                        * don't have any uncached bgs and we've already done a
+                        * full search through.
+                        */
+                       if (have_caching_bg || !full_search)
+                               loop = LOOP_CACHING_WAIT;
+                       else
+                               loop = LOOP_ALLOC_CHUNK;
+               } else {
+                       loop++;
+               }
+
                if (loop == LOOP_ALLOC_CHUNK) {
                        struct btrfs_trans_handle *trans;
                        int exist = 0;
@@ -7199,6 +7425,15 @@ loop:
 
                        ret = do_chunk_alloc(trans, root, flags,
                                             CHUNK_ALLOC_FORCE);
+
+                       /*
+                        * If we can't allocate a new chunk, we've already
+                        * looped through at least once, so move on to the
+                        * NO_EMPTY_SIZE case.
+                        */
+                       if (ret == -ENOSPC)
+                               loop = LOOP_NO_EMPTY_SIZE;
+
                        /*
                         * Do not bail out on ENOSPC since we
                         * can do more things.
@@ -7215,6 +7450,15 @@ loop:
                }
 
                if (loop == LOOP_NO_EMPTY_SIZE) {
+                       /*
+                        * Don't loop again if we already have no empty_size and
+                        * no empty_cluster.
+                        */
+                       if (empty_size == 0 &&
+                           empty_cluster == 0) {
+                               ret = -ENOSPC;
+                               goto out;
+                       }
                        empty_size = 0;
                        empty_cluster = 0;
                }
@@ -7223,11 +7467,20 @@ loop:
        } else if (!ins->objectid) {
                ret = -ENOSPC;
        } else if (ins->objectid) {
+               if (!use_cluster && last_ptr) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->window_start = ins->objectid;
+                       spin_unlock(&last_ptr->lock);
+               }
                ret = 0;
        }
 out:
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
+               spin_lock(&space_info->lock);
+               space_info->max_extent_size = max_extent_size;
+               spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
+       }
        return ret;
 }
 
@@ -7276,7 +7529,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
 {
-       bool final_tried = false;
+       bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
 
@@ -7425,8 +7678,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins,
-                                    int no_quota)
+                                    int level, struct btrfs_key *ins)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7507,7 +7759,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
-                                    u64 offset, struct btrfs_key *ins)
+                                    u64 offset, u64 ram_bytes,
+                                    struct btrfs_key *ins)
 {
        int ret;
 
@@ -7516,7 +7769,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
                                         ins->offset, 0,
                                         root_objectid, owner, offset,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+                                        ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+                                        NULL);
        return ret;
 }
 
@@ -7730,7 +7984,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                                                 ins.objectid, ins.offset,
                                                 parent, root_objectid, level,
                                                 BTRFS_ADD_DELAYED_EXTENT,
-                                                extent_op, 0);
+                                                extent_op);
                if (ret)
                        goto out_free_delayed;
        }
@@ -8271,14 +8525,15 @@ skip:
                        ret = account_shared_subtree(trans, root, next,
                                                     generation, level - 1);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "Error "
                                        "%d accounting shared subtree. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                               root->root_key.objectid, level - 1, 0, 0);
+                               root->root_key.objectid, level - 1, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
        btrfs_tree_unlock(next);
@@ -8363,10 +8618,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        BUG_ON(ret); /* -ENOMEM */
                        ret = account_leaf_items(trans, root, eb);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "error "
                                        "%d accounting leaf items. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                /* make block locked assertion in clean_tree_block happy */
@@ -8688,7 +8944,7 @@ out:
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
-               btrfs_std_error(root->fs_info, err);
+               btrfs_std_error(root->fs_info, err, NULL);
        return err;
 }
 
@@ -8876,7 +9132,7 @@ again:
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
-       if (trans->transaction->dirty_bg_run) {
+       if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
 
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -9560,7 +9816,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_item item;
        struct btrfs_key key;
        int ret = 0;
+       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
+       trans->can_flush_pending_bgs = false;
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
                if (ret)
                        goto next;
@@ -9581,6 +9839,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 next:
                list_del_init(&block_group->bg_list);
        }
+       trans->can_flush_pending_bgs = can_flush_pending_bgs;
 }
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -9623,6 +9882,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        free_excluded_extents(root, cache);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(root, cache)) {
+               u64 new_bytes_used = size - bytes_used;
+
+               bytes_used += new_bytes_used >> 1;
+               fragment_free_space(root, cache);
+       }
+#endif
        /*
         * Call to ensure the corresponding space_info object is created and
         * assigned to our block group, but don't update its counters just yet.
@@ -10363,8 +10630,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
-        * Make sure counter is updated before we wake up
-        * waiters.
+        * Make sure counter is updated before we wake up waiters.
         */
        smp_mb();
        if (waitqueue_active(&root->subv_writers->wait))