Merge branch 'allocator-fixes' into for-linus-4.4
author Chris Mason <clm@fb.com>
Thu, 22 Oct 2015 02:00:38 +0000 (19:00 -0700)
committer Chris Mason <clm@fb.com>
Thu, 22 Oct 2015 02:00:38 +0000 (19:00 -0700)
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h

diff --combined fs/btrfs/ctree.h
@@@ -1154,6 -1154,10 +1154,10 @@@ struct btrfs_space_info 
                                   delalloc/allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
  
+       u64 max_extent_size;    /* This will hold the maximum extent size of
+                                  the space info if we had an ENOSPC in the
+                                  allocator. */
        unsigned int full:1;    /* indicates that we cannot allocate any more
                                   chunks for this space */
        unsigned int chunk_alloc:1;     /* set if we are allocating a chunk */
@@@ -1228,6 -1232,9 +1232,9 @@@ struct btrfs_free_cluster 
        /* first extent starting offset */
        u64 window_start;
  
+       /* We did a full search and couldn't create a cluster */
+       bool fragmented;
        struct btrfs_block_group_cache *block_group;
        /*
         * when a cluster is allocated from a block group, we put the
@@@ -1943,9 -1950,6 +1950,9 @@@ struct btrfs_root 
        int send_in_progress;
        struct btrfs_subvolume_writers *subv_writers;
        atomic_t will_be_snapshoted;
 +
 +      /* For qgroup metadata space reserve */
 +      atomic_t qgroup_meta_rsv;
  };
  
  struct btrfs_ioctl_defrag_range_args {
  #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
  #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR      (1 << 22)
  #define BTRFS_MOUNT_RESCAN_UUID_TREE  (1 << 23)
+ #define BTRFS_MOUNT_FRAGMENT_DATA     (1 << 24)
+ #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
  
  #define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
  #define BTRFS_DEFAULT_MAX_INLINE      (8192)
        btrfs_clear_opt(root->fs_info->mount_opt, opt);                 \
  }
  
+ #ifdef CONFIG_BTRFS_DEBUG
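+ /*
+  * Debug helper: should this block group's free space be artificially
+  * fragmented?  Controlled by the FRAGMENT_DATA / FRAGMENT_METADATA mount
+  * options defined above.
+  */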
+ static inline int
+ btrfs_should_fragment_free_space(struct btrfs_root *root,
+                                struct btrfs_block_group_cache *block_group)
+ {
+       return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+               block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+              (btrfs_test_opt(root, FRAGMENT_DATA) &&
+               block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
+ }
+ #endif
  /*
   * Requests for changes that need to be done during transaction commit.
   *
@@@ -3452,11 -3470,8 +3473,11 @@@ enum btrfs_reserve_flush_enum 
        BTRFS_RESERVE_FLUSH_ALL,
  };
  
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len);
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@@ -3472,8 -3487,8 +3493,8 @@@ void btrfs_subvolume_release_metadata(s
                                      u64 qgroup_reserved);
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
                                              unsigned short type);
diff --combined fs/btrfs/disk-io.c
@@@ -1267,7 -1267,6 +1267,7 @@@ static void __setup_root(u32 nodesize, 
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshoted, 0);
 +      atomic_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@@ -4327,25 -4326,6 +4327,6 @@@ again
        return 0;
  }
  
- static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
-                                      struct btrfs_fs_info *fs_info)
- {
-       struct btrfs_ordered_extent *ordered;
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
- }
  void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
  {
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
  
-       btrfs_free_pending_ordered(cur_trans, root->fs_info);
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
  
diff --combined fs/btrfs/extent-tree.c
@@@ -332,6 -332,27 +332,27 @@@ static void put_caching_control(struct 
                kfree(ctl);
  }
  
+ #ifdef CONFIG_BTRFS_DEBUG
+ static void fragment_free_space(struct btrfs_root *root,
+                               struct btrfs_block_group_cache *block_group)
+ {
+       u64 start = block_group->key.objectid;
+       u64 len = block_group->key.offset;
+       u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+               root->nodesize : root->sectorsize;
+       u64 step = chunk << 1;
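+       /*
+        * Carve out every other chunk-sized range of free space so the block
+        * group ends up heavily fragmented.
+        */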
+       while (len > chunk) {
+               btrfs_remove_free_space(block_group, start, chunk);
+               start += step;
+               if (len < step)
+                       len = 0;
+               else
+                       len -= step;
+       }
+ }
+ #endif
  /*
   * this is only called by cache_block_group, since we could have freed extents
   * we need to check the pinned_extents for any extents that can't be used yet
@@@ -388,6 -409,7 +409,7 @@@ static noinline void caching_thread(str
        u64 last = 0;
        u32 nritems;
        int ret = -ENOMEM;
+       bool wakeup = true;
  
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
  
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
  
+ #ifdef CONFIG_BTRFS_DEBUG
+       /*
+        * If we're fragmenting we don't want to make anybody think we can
+        * allocate from this block group until we've had a chance to fragment
+        * the free space.
+        */
+       if (btrfs_should_fragment_free_space(extent_root, block_group))
+               wakeup = false;
+ #endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@@ -441,7 -472,8 +472,8 @@@ next
  
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
-                               caching_ctl->progress = last;
+                               if (wakeup)
+                                       caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
  
-                       caching_ctl->progress = last;
+                       if (wakeup)
+                               caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
  
                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
-                               wake_up(&caching_ctl->wait);
+                               if (wakeup)
+                                       wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
-       caching_ctl->progress = (u64)-1;
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
  
+ #ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+               u64 bytes_used;
+               spin_lock(&block_group->space_info->lock);
+               spin_lock(&block_group->lock);
+               bytes_used = block_group->key.offset -
+                       btrfs_block_group_used(&block_group->item);
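+               /*
+                * fragment_free_space() below will strip out roughly half of
+                * this free space, so account for it as used up front.
+                */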
+               block_group->space_info->bytes_used += bytes_used >> 1;
+               spin_unlock(&block_group->lock);
+               spin_unlock(&block_group->space_info->lock);
+               fragment_free_space(extent_root, block_group);
+       }
+ #endif
+       caching_ctl->progress = (u64)-1;
  err:
        btrfs_free_path(path);
        up_read(&fs_info->commit_root_sem);
@@@ -607,6 -655,22 +655,22 @@@ static int cache_block_group(struct btr
                        }
                }
                spin_unlock(&cache->lock);
+ #ifdef CONFIG_BTRFS_DEBUG
+               if (ret == 1 &&
+                   btrfs_should_fragment_free_space(fs_info->extent_root,
+                                                    cache)) {
+                       u64 bytes_used;
+                       spin_lock(&cache->space_info->lock);
+                       spin_lock(&cache->lock);
+                       bytes_used = cache->key.offset -
+                               btrfs_block_group_used(&cache->item);
+                       cache->space_info->bytes_used += bytes_used >> 1;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
+                       fragment_free_space(fs_info->extent_root, cache);
+               }
+ #endif
                mutex_unlock(&caching_ctl->mutex);
  
                wake_up(&caching_ctl->wait);
@@@ -2345,11 -2409,6 +2409,11 @@@ static int run_one_delayed_ref(struct b
                                                      node->num_bytes);
                        }
                }
 +
 +              /* Also free its reserved qgroup space */
 +              btrfs_qgroup_free_delayed_ref(root->fs_info,
 +                                            head->qgroup_ref_root,
 +                                            head->qgroup_reserved);
                return ret;
        }
  
@@@ -3343,6 -3402,15 +3407,15 @@@ again
        }
        spin_unlock(&block_group->lock);
  
+       /*
+        * We hit an ENOSPC when setting up the cache in this transaction, just
+        * skip doing the setup, we've already cleared the cache so we're safe.
+        */
+       if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+               ret = -ENOSPC;
+               goto out_put;
+       }
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
  
 -      ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
 +      ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
  
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       /*
+        * Our cache requires contiguous chunks so that we don't modify a bunch
+        * of metadata or split extents when writing the cache out, which means
+        * we can enospc if we are heavily fragmented in addition to just normal
+        * we can hit ENOSPC if we are heavily fragmented in addition to just normal
+        * other block groups for this transaction, maybe we'll unpin enough
+        * space the next time around.
+        */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
 -      btrfs_free_reserved_data_space(inode, num_pages);
+       else if (ret == -ENOSPC)
+               set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
 +      btrfs_free_reserved_data_space(inode, 0, num_pages);
  
  out_put:
        iput(inode);
@@@ -3751,6 -3829,7 +3834,7 @@@ static int update_space_info(struct btr
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
+       found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@@ -3904,7 -3983,11 +3988,7 @@@ u64 btrfs_get_alloc_profile(struct btrf
        return ret;
  }
  
 -/*
 - * This will check the space that the inode allocates from to make sure we have
 - * enough space for bytes.
 - */
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
  {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@@ -4003,7 -4086,8 +4087,8 @@@ commit_trans
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
-                           trans->transaction->have_free_bgs ||
+                           test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+                                    &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
 -      ret = btrfs_qgroup_reserve(root, write_bytes);
 -      if (ret)
 -              goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
 -out:
        spin_unlock(&data_sinfo->lock);
  
        return ret;
  }
  
  /*
 - * Called if we need to clear a data reservation for this inode.
 + * New check_data_free_space() with the ability to do precise data reservation.
 + * It will eventually replace the old btrfs_check_data_free_space(); to keep
 + * the patch series split, the new function is added first and callers are
 + * converted later.
 + */
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 +{
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      int ret;
 +
 +      /* align the range */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
 +
 +      ret = btrfs_alloc_data_chunk_ondemand(inode, len);
 +      if (ret < 0)
 +              return ret;
 +
 +      /*
 +       * Use the new btrfs_qgroup_reserve_data() to reserve precise data space.
 +       *
 +       * TODO: Find a good way to avoid reserving data space for NOCOW ranges
 +       * without impacting performance in the quota-disabled case.
 +       */
 +      ret = btrfs_qgroup_reserve_data(inode, start, len);
 +      return ret;
 +}
 +
 +/*
 + * Called if we need to clear a data reservation for this inode,
 + * normally in an error case.
 + *
 + * This one will *NOT* use the accurate qgroup reserved space API; it is only
 + * for cases where we can't sleep and are sure it won't affect the qgroup
 + * reserved space.
 + * Like clear_bit_hook().
   */
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
  
 -      /* make sure bytes are sectorsize aligned */
 -      bytes = ALIGN(bytes, root->sectorsize);
 +      /* Make sure the range is aligned to sectorsize */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
  
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
 -      WARN_ON(data_sinfo->bytes_may_use < bytes);
 -      data_sinfo->bytes_may_use -= bytes;
 +      if (WARN_ON(data_sinfo->bytes_may_use < len))
 +              data_sinfo->bytes_may_use = 0;
 +      else
 +              data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
 -                                    data_sinfo->flags, bytes, 0);
 +                                    data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
  }
  
 +/*
 + * Called if we need to clear a data reservation for this inode,
 + * normally in an error case.
 + *
 + * This one will handle the per-inode data rsv map for the accurate reserved
 + * space framework.
 + */
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 +{
 +      btrfs_free_reserved_data_space_noquota(inode, start, len);
 +      btrfs_qgroup_free_data(inode, start, len);
 +}
 +
  static void force_metadata_allocation(struct btrfs_fs_info *info)
  {
        struct list_head *head = &info->space_info;
@@@ -5381,7 -5417,7 +5466,7 @@@ int btrfs_subvolume_reserve_metadata(st
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
 -              ret = btrfs_qgroup_reserve(root, num_bytes);
 +              ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
  
 -      if (ret) {
 -              if (*qgroup_reserved)
 -                      btrfs_qgroup_free(root, *qgroup_reserved);
 -      }
 +      if (ret && *qgroup_reserved)
 +              btrfs_qgroup_free_meta(root, *qgroup_reserved);
  
        return ret;
  }
@@@ -5561,15 -5599,15 +5646,15 @@@ int btrfs_delalloc_reserve_metadata(str
        spin_unlock(&BTRFS_I(inode)->lock);
  
        if (root->fs_info->quota_enabled) {
 -              ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
 +              ret = btrfs_qgroup_reserve_meta(root,
 +                              nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
  
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
 -              if (root->fs_info->quota_enabled)
 -                      btrfs_qgroup_free(root, nr_extents * root->nodesize);
 +              btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
  
@@@ -5692,48 -5730,41 +5777,48 @@@ void btrfs_delalloc_release_metadata(st
  }
  
  /**
 - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
 + * btrfs_delalloc_reserve_space - reserve data and metadata space for
 + * delalloc
   * @inode: inode we're writing to
 - * @num_bytes: the number of bytes we want to allocate
 + * @start: start range we are writing to
 + * @len: length of the range we are writing to
 + *
 + * TODO: This function will eventually replace the old btrfs_delalloc_reserve_space()
   *
   * This will do the following things
   *
 - * o reserve space in the data space info for num_bytes
 - * o reserve space in the metadata space info based on number of outstanding
 + * o reserve space in the data space info for len bytes
 + *   and reserve the corresponding precise qgroup space
 + *   (Done in check_data_free_space)
 + *
 + * o reserve metadata space, based on the number of outstanding
   *   extents and how much csums will be needed
 - * o add to the inodes ->delalloc_bytes
 + *   also reserve metadata space in a per-root over-reserve manner.
 + * o add to the inode's ->delalloc_bytes
   * o add it to the fs_info's delalloc inodes list.
 + *   (All three of the above are done in delalloc_reserve_metadata)
   *
 - * This will return 0 for success and -ENOSPC if there is no space left.
 + * Return 0 for success
 + * Return <0 for error (-ENOSPC or -EDQUOT)
   */
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
  {
        int ret;
  
 -      ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
 -      if (ret)
 -              return ret;
 -
 -      ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
 -      if (ret) {
 -              btrfs_free_reserved_data_space(inode, num_bytes);
 +      ret = btrfs_check_data_free_space(inode, start, len);
 +      if (ret < 0)
                return ret;
 -      }
 -
 -      return 0;
 +      ret = btrfs_delalloc_reserve_metadata(inode, len);
 +      if (ret < 0)
 +              btrfs_free_reserved_data_space(inode, start, len);
 +      return ret;
  }
  
  /**
   * btrfs_delalloc_release_space - release data and metadata space for delalloc
   * @inode: inode we're releasing space for
 - * @num_bytes: the number of bytes we want to free up
 + * @start: start position of the space already reserved
 + * @len: length of the space already reserved
   *
   * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
   * called in the case that we don't need the metadata AND data reservations
   * This function will release the metadata space that was not used and will
   * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
   * list if there are no delalloc bytes left.
 + * Also it will handle the qgroup reserved space.
   */
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
  {
 -      btrfs_delalloc_release_metadata(inode, num_bytes);
 -      btrfs_free_reserved_data_space(inode, num_bytes);
 +      btrfs_delalloc_release_metadata(inode, len);
 +      btrfs_free_reserved_data_space(inode, start, len);
  }
  
  static int update_block_group(struct btrfs_trans_handle *trans,
@@@ -6112,6 -6142,34 +6197,34 @@@ void btrfs_prepare_extent_commit(struc
        update_global_block_rsv(fs_info);
  }
  
+ /*
+  * Returns the free cluster for the given space info and sets empty_cluster to
+  * what it should be based on the mount options.
+  */
+ static struct btrfs_free_cluster *
+ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+                  u64 *empty_cluster)
+ {
+       struct btrfs_free_cluster *ret = NULL;
+       bool ssd = btrfs_test_opt(root, SSD);
+       *empty_cluster = 0;
+       if (btrfs_mixed_space_info(space_info))
+               return ret;
+       if (ssd)
+               *empty_cluster = 2 * 1024 * 1024;
+       if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+               ret = &root->fs_info->meta_alloc_cluster;
+               if (!ssd)
+                       *empty_cluster = 64 * 1024;
+       } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+               ret = &root->fs_info->data_alloc_cluster;
+       }
+       return ret;
+ }
  static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                              const bool return_free_space)
  {
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       struct btrfs_free_cluster *cluster = NULL;
        u64 len;
+       u64 total_unpinned = 0;
+       u64 empty_cluster = 0;
        bool readonly;
  
        while (start <= end) {
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
+                       total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache); /* Logic error */
+                       cluster = fetch_cluster_info(root,
+                                                    cache->space_info,
+                                                    &empty_cluster);
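+                       /*
+                        * Doubled so that the fragmented flag is only cleared
+                        * below once well more than one cluster's worth of
+                        * space has been unpinned in this block group.
+                        */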
+                       empty_cluster <<= 1;
                }
  
                len = cache->key.objectid + cache->key.offset - start;
                }
  
                start += len;
+               total_unpinned += len;
                space_info = cache->space_info;
  
+               /*
+                * If this space cluster has been marked as fragmented and we've
+                * unpinned enough in this block group to potentially allow a
+                * cluster to be created inside of it, go ahead and clear the
+                * fragmented flag.
+                */
+               if (cluster && cluster->fragmented &&
+                   total_unpinned > empty_cluster) {
+                       spin_lock(&cluster->lock);
+                       cluster->fragmented = 0;
+                       spin_unlock(&cluster->lock);
+               }
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
+               space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@@ -6880,7 -6962,7 +7017,7 @@@ static noinline int find_free_extent(st
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
-       int empty_cluster = 2 * 1024 * 1024;
+       u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
+       bool full_search = false;
  
        WARN_ON(num_bytes < root->sectorsize);
        ins->type = BTRFS_EXTENT_ITEM_KEY;
        }
  
        /*
-        * If the space info is for both data and metadata it means we have a
-        * small filesystem and we can't use the clustering stuff.
+        * If our free space is heavily fragmented we may not be able to make
+        * big contiguous allocations, so instead of doing the expensive search
+        * for free space, simply return ENOSPC with our max_extent_size so we
+        * can go ahead and search for a more manageable chunk.
+        *
+        * If our max_extent_size is large enough for our allocation simply
+        * disable clustering since we will likely not be able to find enough
+        * space to create a cluster and induce latency trying.
         */
-       if (btrfs_mixed_space_info(space_info))
-               use_cluster = false;
-       if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
-               last_ptr = &root->fs_info->meta_alloc_cluster;
-               if (!btrfs_test_opt(root, SSD))
-                       empty_cluster = 64 * 1024;
-       }
-       if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
-           btrfs_test_opt(root, SSD)) {
-               last_ptr = &root->fs_info->data_alloc_cluster;
+       if (unlikely(space_info->max_extent_size)) {
+               spin_lock(&space_info->lock);
+               if (space_info->max_extent_size &&
+                   num_bytes > space_info->max_extent_size) {
+                       ins->offset = space_info->max_extent_size;
+                       spin_unlock(&space_info->lock);
+                       return -ENOSPC;
+               } else if (space_info->max_extent_size) {
+                       use_cluster = false;
+               }
+               spin_unlock(&space_info->lock);
        }
  
+       last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
                        hint_byte = last_ptr->window_start;
+               if (last_ptr->fragmented) {
+                       /*
+                        * We still set window_start so we can keep track of the
+                        * last place we found an allocation to try and save
+                        * some time.
+                        */
+                       hint_byte = last_ptr->window_start;
+                       use_cluster = false;
+               }
                spin_unlock(&last_ptr->lock);
        }
  
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
-       if (!last_ptr)
-               empty_cluster = 0;
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
        }
  search:
        have_caching_bg = false;
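+       /*
+        * Starting at index 0 or at our natural raid index means this pass
+        * will end up considering every block group for this allocation, so
+        * note that we are doing a full search.
+        */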
+       if (index == 0 || index == __get_raid_index(flags))
+               full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
  have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
+                       have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
                 */
-               if (last_ptr) {
+               if (last_ptr && use_cluster) {
                        struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
@@@ -7142,6 -7239,16 +7294,16 @@@ refill_cluster
                }
  
  unclustered_alloc:
+               /*
+                * We are doing an unclustered alloc, set the fragmented flag so
+                * we don't bother trying to set up a cluster again until we get
+                * more space.
+                */
+               if (unlikely(last_ptr)) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->fragmented = 1;
+                       spin_unlock(&last_ptr->lock);
+               }
                spin_lock(&block_group->free_space_ctl->tree_lock);
                if (cached &&
                    block_group->free_space_ctl->free_space <
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
-                       if (!cached)
-                               have_caching_bg = true;
                        goto loop;
                }
  checks:
@@@ -7232,7 -7337,20 +7392,20 @@@ loop
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
-               loop++;
+               if (loop == LOOP_CACHING_NOWAIT) {
+                       /*
+                        * We want to skip the LOOP_CACHING_WAIT step if we
+                        * don't have any uncached bgs and we've already done a
+                        * full search through.
+                        */
+                       if (have_caching_bg || !full_search)
+                               loop = LOOP_CACHING_WAIT;
+                       else
+                               loop = LOOP_ALLOC_CHUNK;
+               } else {
+                       loop++;
+               }
                if (loop == LOOP_ALLOC_CHUNK) {
                        struct btrfs_trans_handle *trans;
                        int exist = 0;
  
                        ret = do_chunk_alloc(trans, root, flags,
                                             CHUNK_ALLOC_FORCE);
+                       /*
+                        * If we can't allocate a new chunk we've already looped
+                        * through everything at least once, so move on to the
+                        * NO_EMPTY_SIZE case.
+                        */
+                       if (ret == -ENOSPC)
+                               loop = LOOP_NO_EMPTY_SIZE;
                        /*
                         * Do not bail out on ENOSPC since we
                         * can do more things.
                }
  
                if (loop == LOOP_NO_EMPTY_SIZE) {
+                       /*
+                        * Don't loop again if we already have no empty_size and
+                        * no empty_cluster.
+                        */
+                       if (empty_size == 0 &&
+                           empty_cluster == 0) {
+                               ret = -ENOSPC;
+                               goto out;
+                       }
                        empty_size = 0;
                        empty_cluster = 0;
                }
        } else if (!ins->objectid) {
                ret = -ENOSPC;
        } else if (ins->objectid) {
+               if (!use_cluster && last_ptr) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->window_start = ins->objectid;
+                       spin_unlock(&last_ptr->lock);
+               }
                ret = 0;
        }
  out:
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
+               spin_lock(&space_info->lock);
+               space_info->max_extent_size = max_extent_size;
+               spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
+       }
        return ret;
  }
  
@@@ -7327,7 -7472,7 +7527,7 @@@ int btrfs_reserve_extent(struct btrfs_r
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
  {
-       bool final_tried = false;
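+       /*
+        * If the requested size is already the minimum there is nothing
+        * smaller to fall back to, so treat the first attempt as the final one.
+        */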
+       bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
  
@@@ -8929,7 -9074,7 +9129,7 @@@ again
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
-       if (trans->transaction->dirty_bg_run) {
+       if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
  
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@@ -9679,6 -9824,14 +9879,14 @@@ int btrfs_make_block_group(struct btrfs
  
        free_excluded_extents(root, cache);
  
+ #ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(root, cache)) {
+               u64 new_bytes_used = size - bytes_used;
+               bytes_used += new_bytes_used >> 1;
+               fragment_free_space(root, cache);
+       }
+ #endif
        /*
         * Call to ensure the corresponding space_info object is created and
         * assigned to our block group, but don't update its counters just yet.
diff --combined fs/btrfs/inode.c
@@@ -310,13 -310,6 +310,13 @@@ static noinline int cow_file_range_inli
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
  out:
 +      /*
 +       * Don't forget to free the reserved space: an inlined extent won't
 +       * count as a data extent, so free the space directly here.
 +       * At reserve time the range is always aligned to page size, so just
 +       * free one page here.
 +       */
 +      btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
@@@ -1776,8 -1769,7 +1776,8 @@@ static void btrfs_clear_bit_hook(struc
  
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
 -                      btrfs_free_reserved_data_space(inode, len);
 +                      btrfs_free_reserved_data_space_noquota(inode,
 +                                      state->start, len);
  
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
                                     root->fs_info->delalloc_batch);
@@@ -2000,8 -1992,7 +2000,8 @@@ again
                goto again;
        }
  
 -      ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 +      ret = btrfs_delalloc_reserve_space(inode, page_start,
 +                                         PAGE_CACHE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
@@@ -2128,16 -2119,6 +2128,16 @@@ static int insert_reserved_file_extent(
        ret = btrfs_alloc_reserved_file_extent(trans, root,
                                        root->root_key.objectid,
                                        btrfs_ino(inode), file_pos, &ins);
 +      if (ret < 0)
 +              goto out;
 +      /*
 +       * Release the reserved range from inode dirty range map, and
 +       * move it to delayed ref codes, as now accounting only happens at
 +       * commit_transaction() time.
 +       */
 +      btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
 +      ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
 +                      root->objectid, disk_bytenr, ram_bytes);
  out:
        btrfs_free_path(path);
  
@@@ -2845,14 -2826,6 +2845,14 @@@ static int btrfs_finish_ordered_io(stru
  
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
 +
 +              /*
 +               * For the mwrite (mmap + memset to write) case, we still
 +               * reserve space for the NOCOW range.
 +               * As NOCOW won't create a new delayed ref, just free the space.
 +               */
 +              btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
 +                                     ordered_extent->len);
                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (nolock)
                        trans = btrfs_join_transaction_nolock(root);
@@@ -4655,17 -4628,14 +4655,17 @@@ int btrfs_truncate_page(struct inode *i
        if ((offset & (blocksize - 1)) == 0 &&
            (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
 -      ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 +      ret = btrfs_delalloc_reserve_space(inode,
 +                      round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
        if (ret)
                goto out;
  
  again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
 -              btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 +              btrfs_delalloc_release_space(inode,
 +                              round_down(from, PAGE_CACHE_SIZE),
 +                              PAGE_CACHE_SIZE);
                ret = -ENOMEM;
                goto out;
        }
  
  out_unlock:
        if (ret)
 -              btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 +              btrfs_delalloc_release_space(inode, page_start,
 +                                           PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
  out:
@@@ -5132,18 -5101,6 +5132,18 @@@ static void evict_inode_truncate_pages(
                spin_unlock(&io_tree->lock);
  
                lock_extent_bits(io_tree, start, end, 0, &cached_state);
 +
 +              /*
 +               * If the DELALLOC flag is still set, the extent didn't reach disk,
 +               * and its reserved space won't be freed by delayed_ref.
 +               * So we need to free its reserved space here.
 +               * (Refer to comment in btrfs_invalidatepage, case 2)
 +               *
 +               * Note, end is the bytenr of last byte, so we need + 1 here.
 +               */
 +              if (state->state & EXTENT_DELALLOC)
 +                      btrfs_qgroup_free_data(inode, start, end - start + 1);
 +
                clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@@ -7677,7 -7634,7 +7677,7 @@@ unlock
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
  
 -              btrfs_free_reserved_data_space(inode, len);
 +              btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
                current->journal_info = dio_data;
@@@ -8467,7 -8424,7 +8467,7 @@@ static ssize_t btrfs_direct_IO(struct k
                        mutex_unlock(&inode->i_mutex);
                        relock = true;
                }
 -              ret = btrfs_delalloc_reserve_space(inode, count);
 +              ret = btrfs_delalloc_reserve_space(inode, offset, count);
                if (ret)
                        goto out;
                dio_data.outstanding_extents = div64_u64(count +
                current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED) {
                        if (dio_data.reserve)
 -                              btrfs_delalloc_release_space(inode,
 -                                                      dio_data.reserve);
 +                              btrfs_delalloc_release_space(inode, offset,
 +                                                           dio_data.reserve);
                } else if (ret >= 0 && (size_t)ret < count)
 -                      btrfs_delalloc_release_space(inode,
 +                      btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
        }
  out:
@@@ -8658,18 -8615,6 +8658,18 @@@ static void btrfs_invalidatepage(struc
                }
        }
  
 +      /*
 +       * Qgroup reserved space handler
 +       * Page here will be either
 +       * 1) Already written to disk
 +       *    In this case, its reserved space is released from data rsv map
 +       *    and will be freed by delayed_ref handler finally.
 +       *    So even if we call qgroup_free_data(), it won't decrease reserved
 +       *    space.
 +       * 2) Not written to disk
 +       *    This means the reserved space should be freed here.
 +       */
 +      btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
@@@ -8720,11 -8665,7 +8720,11 @@@ int btrfs_page_mkwrite(struct vm_area_s
        u64 page_end;
  
        sb_start_pagefault(inode->i_sb);
 -      ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 +      page_start = page_offset(page);
 +      page_end = page_start + PAGE_CACHE_SIZE - 1;
 +
 +      ret = btrfs_delalloc_reserve_space(inode, page_start,
 +                                         PAGE_CACHE_SIZE);
        if (!ret) {
                ret = file_update_time(vma->vm_file);
                reserved = 1;
  again:
        lock_page(page);
        size = i_size_read(inode);
 -      page_start = page_offset(page);
 -      page_end = page_start + PAGE_CACHE_SIZE - 1;
  
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
@@@ -8819,7 -8762,7 +8819,7 @@@ out_unlock
        }
        unlock_page(page);
  out:
 -      btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 +      btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
  out_noreserve:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@@ -9108,7 -9051,6 +9108,7 @@@ void btrfs_destroy_inode(struct inode *
                        btrfs_put_ordered_extent(ordered);
                }
        }
 +      btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
  free:
@@@ -9745,6 -9687,7 +9745,7 @@@ static int __btrfs_prealloc_file_range(
        u64 cur_offset = start;
        u64 i_size;
        u64 cur_bytes;
+       u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
  
  
                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
                cur_bytes = max(cur_bytes, min_size);
+               /*
+                * If we are severely fragmented we could end up with really
+                * small allocations, so if the allocator is returning small
+                * chunks, let's make its job easier by only searching for those
+                * sized chunks.
+                */
+               cur_bytes = min(cur_bytes, last_alloc);
                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
                                           *alloc_hint, &ins, 1, 0);
                if (ret) {
                        break;
                }
  
+               last_alloc = ins.offset;
                ret = insert_reserved_file_extent(trans, inode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,
diff --combined fs/btrfs/transaction.c
@@@ -232,15 -232,16 +232,16 @@@ loop
        extwriter_counter_init(cur_trans, type);
        init_waitqueue_head(&cur_trans->writer_wait);
        init_waitqueue_head(&cur_trans->commit_wait);
+       init_waitqueue_head(&cur_trans->pending_wait);
        cur_trans->state = TRANS_STATE_RUNNING;
        /*
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
        atomic_set(&cur_trans->use_count, 2);
-       cur_trans->have_free_bgs = 0;
+       atomic_set(&cur_trans->pending_ordered, 0);
+       cur_trans->flags = 0;
        cur_trans->start_time = get_seconds();
-       cur_trans->dirty_bg_run = 0;
  
        memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
  
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
-       INIT_LIST_HEAD(&cur_trans->pending_ordered);
        INIT_LIST_HEAD(&cur_trans->dirty_bgs);
        INIT_LIST_HEAD(&cur_trans->io_bgs);
        INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@@ -480,10 -480,13 +480,10 @@@ start_transaction(struct btrfs_root *ro
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
 -              if (root->fs_info->quota_enabled &&
 -                  is_fstree(root->root_key.objectid)) {
 -                      qgroup_reserved = num_items * root->nodesize;
 -                      ret = btrfs_qgroup_reserve(root, qgroup_reserved);
 -                      if (ret)
 -                              return ERR_PTR(ret);
 -              }
 +              qgroup_reserved = num_items * root->nodesize;
 +              ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
 +              if (ret)
 +                      return ERR_PTR(ret);
  
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                /*
@@@ -544,12 -547,10 +544,11 @@@ again
        h->transaction = cur_trans;
        h->root = root;
        h->use_count = 1;
 +
        h->type = type;
        h->can_flush_pending_bgs = true;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
-       INIT_LIST_HEAD(&h->ordered);
  
        smp_mb();
        if (cur_trans->state >= TRANS_STATE_BLOCKED &&
                h->bytes_reserved = num_bytes;
                h->reloc_reserved = reloc_reserved;
        }
 -      h->qgroup_reserved = qgroup_reserved;
  
  got_it:
        btrfs_record_root_in_trans(h, root);
@@@ -583,7 -585,8 +582,7 @@@ alloc_fail
                btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                        num_bytes);
  reserve_fail:
 -      if (qgroup_reserved)
 -              btrfs_qgroup_free(root, qgroup_reserved);
 +      btrfs_qgroup_free_meta(root, qgroup_reserved);
        return ERR_PTR(ret);
  }
  
@@@ -780,12 -783,6 +779,6 @@@ static int __btrfs_end_transaction(stru
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
  
-       if (!list_empty(&trans->ordered)) {
-               spin_lock(&info->trans_lock);
-               list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
-               spin_unlock(&info->trans_lock);
-       }
        trans->delayed_ref_updates = 0;
        if (!trans->sync) {
                must_run_delayed_refs =
                        must_run_delayed_refs = 2;
        }
  
 -      if (trans->qgroup_reserved) {
 -              /*
 -               * the same root has to be passed here between start_transaction
 -               * and end_transaction. Subvolume quota depends on this.
 -               */
 -              btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
 -              trans->qgroup_reserved = 0;
 -      }
 -
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
  
@@@ -1218,7 -1224,6 +1211,7 @@@ static noinline int commit_fs_roots(str
                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
 +                      btrfs_qgroup_free_meta_all(root);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
@@@ -1776,25 -1781,10 +1769,10 @@@ static inline void btrfs_wait_delalloc_
  }
  
  static inline void
- btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
-                          struct btrfs_fs_info *fs_info)
+ btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
  {
-       struct btrfs_ordered_extent *ordered;
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-               wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
-                                                  &ordered->flags));
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
+       wait_event(cur_trans->pending_wait,
+                  atomic_read(&cur_trans->pending_ordered) == 0);
  }
  
  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 -      if (trans->qgroup_reserved) {
 -              btrfs_qgroup_free(root, trans->qgroup_reserved);
 -              trans->qgroup_reserved = 0;
 -      }
  
        cur_trans = trans->transaction;
  
                return ret;
        }
  
-       if (!cur_trans->dirty_bg_run) {
+       if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
                int run_it = 0;
  
                /* this mutex is also taken before trying to set
                 * after a extents from that block group have been
                 * allocated for cache files.  btrfs_set_block_group_ro
                 * will wait for the transaction to commit if it
-                * finds dirty_bg_run = 1
+                * finds BTRFS_TRANS_DIRTY_BG_RUN set.
                 *
-                * The dirty_bg_run flag is also used to make sure only
-                * one process starts all the block group IO.  It wouldn't
+                * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+                * only one process starts all the block group IO.  It wouldn't
                 * hurt to have more than one go through, but there's no
                 * real advantage to it either.
                 */
                mutex_lock(&root->fs_info->ro_block_group_mutex);
-               if (!cur_trans->dirty_bg_run) {
+               if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+                                     &cur_trans->flags))
                        run_it = 1;
-                       cur_trans->dirty_bg_run = 1;
-               }
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
  
                if (run_it)
        }
  
        spin_lock(&root->fs_info->trans_lock);
-       list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
        if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                spin_unlock(&root->fs_info->trans_lock);
                atomic_inc(&cur_trans->use_count);
  
        btrfs_wait_delalloc_flush(root->fs_info);
  
-       btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+       btrfs_wait_pending_ordered(cur_trans);
  
        btrfs_scrub_pause(root);
        /*
  
        btrfs_finish_extent_commit(trans, root);
  
-       if (cur_trans->have_free_bgs)
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
                btrfs_clear_space_info_full(root->fs_info);
  
        root->fs_info->last_trans_committed = cur_trans->transid;
@@@ -2175,6 -2167,10 +2151,6 @@@ cleanup_transaction
        btrfs_trans_release_metadata(trans, root);
        btrfs_trans_release_chunk_metadata(trans);
        trans->block_rsv = NULL;
 -      if (trans->qgroup_reserved) {
 -              btrfs_qgroup_free(root, trans->qgroup_reserved);
 -              trans->qgroup_reserved = 0;
 -      }
        btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
        if (current->journal_info == trans)
                current->journal_info = NULL;
diff --combined fs/btrfs/transaction.h
@@@ -32,6 -32,10 +32,10 @@@ enum btrfs_trans_state 
        TRANS_STATE_MAX                 = 6,
  };
  
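+ /* Flag bits for btrfs_transaction::flags, used with set_bit()/test_bit() */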
+ #define BTRFS_TRANS_HAVE_FREE_BGS     0
+ #define BTRFS_TRANS_DIRTY_BG_RUN      1
+ #define BTRFS_TRANS_CACHE_ENOSPC      2
  struct btrfs_transaction {
        u64 transid;
        /*
         */
        atomic_t num_writers;
        atomic_t use_count;
+       atomic_t pending_ordered;
  
-       /*
-        * true if there is free bgs operations in this transaction
-        */
-       int have_free_bgs;
+       unsigned long flags;
  
        /* Be protected by fs_info->trans_lock when we want to change it. */
        enum btrfs_trans_state state;
@@@ -59,9 -61,9 +61,9 @@@
        unsigned long start_time;
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
+       wait_queue_head_t pending_wait;
        struct list_head pending_snapshots;
        struct list_head pending_chunks;
-       struct list_head pending_ordered;
        struct list_head switch_commits;
        struct list_head dirty_bgs;
        struct list_head io_bgs;
@@@ -80,7 -82,6 +82,6 @@@
        spinlock_t dropped_roots_lock;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
-       int dirty_bg_run;
  };
  
  #define __TRANS_FREEZABLE     (1U << 0)
@@@ -107,6 -108,7 +108,6 @@@ struct btrfs_trans_handle 
        u64 transid;
        u64 bytes_reserved;
        u64 chunk_bytes_reserved;
 -      u64 qgroup_reserved;
        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
         */
        struct btrfs_root *root;
        struct seq_list delayed_ref_elem;
-       struct list_head ordered;
        struct list_head qgroup_ref_list;
        struct list_head new_bgs;
  };