Merge branch 'allocator-fixes' into for-linus-4.4
author Chris Mason <clm@fb.com>
Thu, 22 Oct 2015 02:00:38 +0000 (19:00 -0700)
committer Chris Mason <clm@fb.com>
Thu, 22 Oct 2015 02:00:38 +0000 (19:00 -0700)
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h

diff --combined fs/btrfs/ctree.h
@@@ -1154,6 -1154,10 +1154,10 @@@ struct btrfs_space_info 
                                   delalloc/allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
  
+       u64 max_extent_size;    /* This will hold the maximum extent size of
+                                  the space info if we had an ENOSPC in the
+                                  allocator. */
        unsigned int full:1;    /* indicates that we cannot allocate any more
                                   chunks for this space */
        unsigned int chunk_alloc:1;     /* set if we are allocating a chunk */
@@@ -1228,6 -1232,9 +1232,9 @@@ struct btrfs_free_cluster 
        /* first extent starting offset */
        u64 window_start;
  
+       /* We did a full search and couldn't create a cluster */
+       bool fragmented;
        struct btrfs_block_group_cache *block_group;
        /*
         * when a cluster is allocated from a block group, we put the
@@@ -1943,9 -1950,6 +1950,9 @@@ struct btrfs_root 
        int send_in_progress;
        struct btrfs_subvolume_writers *subv_writers;
        atomic_t will_be_snapshoted;
 +
 +      /* For qgroup metadata space reserve */
 +      atomic_t qgroup_meta_rsv;
  };
  
  struct btrfs_ioctl_defrag_range_args {
  #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
  #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR      (1 << 22)
  #define BTRFS_MOUNT_RESCAN_UUID_TREE  (1 << 23)
+ #define BTRFS_MOUNT_FRAGMENT_DATA     (1 << 24)
+ #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
  
  #define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
  #define BTRFS_DEFAULT_MAX_INLINE      (8192)
        btrfs_clear_opt(root->fs_info->mount_opt, opt);                 \
  }
  
+ #ifdef CONFIG_BTRFS_DEBUG
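+ /*
+  * Debug helper: should this block group's free space be artificially
+  * fragmented?  Controlled by the FRAGMENT_DATA / FRAGMENT_METADATA mount
+  * options defined above.
+  */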
+ static inline int
+ btrfs_should_fragment_free_space(struct btrfs_root *root,
+                                struct btrfs_block_group_cache *block_group)
+ {
+       return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+               block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+              (btrfs_test_opt(root, FRAGMENT_DATA) &&
+               block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
+ }
+ #endif
  /*
   * Requests for changes that need to be done during transaction commit.
   *
@@@ -3452,11 -3470,8 +3473,11 @@@ enum btrfs_reserve_flush_enum 
        BTRFS_RESERVE_FLUSH_ALL,
  };
  
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len);
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@@ -3472,8 -3487,8 +3493,8 @@@ void btrfs_subvolume_release_metadata(s
                                      u64 qgroup_reserved);
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
                                              unsigned short type);
diff --combined fs/btrfs/disk-io.c
@@@ -1267,7 -1267,6 +1267,7 @@@ static void __setup_root(u32 nodesize, 
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshoted, 0);
 +      atomic_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@@ -4327,25 -4326,6 +4327,6 @@@ again
        return 0;
  }
  
- static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
-                                      struct btrfs_fs_info *fs_info)
- {
-       struct btrfs_ordered_extent *ordered;
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
- }
  void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
  {
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
  
-       btrfs_free_pending_ordered(cur_trans, root->fs_info);
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
  
diff --combined fs/btrfs/extent-tree.c
@@@ -332,6 -332,27 +332,27 @@@ static void put_caching_control(struct 
                kfree(ctl);
  }
  
+ #ifdef CONFIG_BTRFS_DEBUG
+ static void fragment_free_space(struct btrfs_root *root,
+                               struct btrfs_block_group_cache *block_group)
+ {
+       u64 start = block_group->key.objectid;
+       u64 len = block_group->key.offset;
+       u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+               root->nodesize : root->sectorsize;
+       u64 step = chunk << 1;
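+       /*
+        * Carve out every other chunk-sized range of free space so the block
+        * group ends up heavily fragmented.
+        */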
+       while (len > chunk) {
+               btrfs_remove_free_space(block_group, start, chunk);
+               start += step;
+               if (len < step)
+                       len = 0;
+               else
+                       len -= step;
+       }
+ }
+ #endif
  /*
   * this is only called by cache_block_group, since we could have freed extents
   * we need to check the pinned_extents for any extents that can't be used yet
@@@ -388,6 -409,7 +409,7 @@@ static noinline void caching_thread(str
        u64 last = 0;
        u32 nritems;
        int ret = -ENOMEM;
+       bool wakeup = true;
  
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
  
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
  
+ #ifdef CONFIG_BTRFS_DEBUG
+       /*
+        * If we're fragmenting we don't want to make anybody think we can
+        * allocate from this block group until we've had a chance to fragment
+        * the free space.
+        */
+       if (btrfs_should_fragment_free_space(extent_root, block_group))
+               wakeup = false;
+ #endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@@ -441,7 -472,8 +472,8 @@@ next
  
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
-                               caching_ctl->progress = last;
+                               if (wakeup)
+                                       caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
  
-                       caching_ctl->progress = last;
+                       if (wakeup)
+                               caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
  
                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
-                               wake_up(&caching_ctl->wait);
+                               if (wakeup)
+                                       wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
-       caching_ctl->progress = (u64)-1;
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
  
+ #ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+               u64 bytes_used;
+               spin_lock(&block_group->space_info->lock);
+               spin_lock(&block_group->lock);
+               bytes_used = block_group->key.offset -
+                       btrfs_block_group_used(&block_group->item);
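+               /*
+                * fragment_free_space() below will strip out roughly half of
+                * this free space, so account for it as used up front.
+                */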
+               block_group->space_info->bytes_used += bytes_used >> 1;
+               spin_unlock(&block_group->lock);
+               spin_unlock(&block_group->space_info->lock);
+               fragment_free_space(extent_root, block_group);
+       }
+ #endif
+       caching_ctl->progress = (u64)-1;
  err:
        btrfs_free_path(path);
        up_read(&fs_info->commit_root_sem);
@@@ -607,6 -655,22 +655,22 @@@ static int cache_block_group(struct btr
                        }
                }
                spin_unlock(&cache->lock);
+ #ifdef CONFIG_BTRFS_DEBUG
+               if (ret == 1 &&
+                   btrfs_should_fragment_free_space(fs_info->extent_root,
+                                                    cache)) {
+                       u64 bytes_used;
+                       spin_lock(&cache->space_info->lock);
+                       spin_lock(&cache->lock);
+                       bytes_used = cache->key.offset -
+                               btrfs_block_group_used(&cache->item);
+                       cache->space_info->bytes_used += bytes_used >> 1;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
+                       fragment_free_space(fs_info->extent_root, cache);
+               }
+ #endif
                mutex_unlock(&caching_ctl->mutex);
  
                wake_up(&caching_ctl->wait);
@@@ -2345,11 -2409,6 +2409,11 @@@ static int run_one_delayed_ref(struct b
                                                      node->num_bytes);
                        }
                }
 +
 +              /* Also free its reserved qgroup space */
 +              btrfs_qgroup_free_delayed_ref(root->fs_info,
 +                                            head->qgroup_ref_root,
 +                                            head->qgroup_reserved);
                return ret;
        }
  
@@@ -3343,6 -3402,15 +3407,15 @@@ again
        }
        spin_unlock(&block_group->lock);
  
+       /*
+        * We hit an ENOSPC when setting up the cache in this transaction, just
+        * skip doing the setup, we've already cleared the cache so we're safe.
+        */
+       if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+               ret = -ENOSPC;
+               goto out_put;
+       }
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
  
 -      ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
 +      ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
  
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       /*
+        * Our cache requires contiguous chunks so that we don't modify a bunch
+        * of metadata or split extents when writing the cache out, which means
+        * we can enospc if we are heavily fragmented in addition to just normal
+        * we can hit ENOSPC if we are heavily fragmented in addition to just normal
+        * other block groups for this transaction, maybe we'll unpin enough
+        * space the next time around.
+        */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
 -      btrfs_free_reserved_data_space(inode, num_pages);
+       else if (ret == -ENOSPC)
+               set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
 +      btrfs_free_reserved_data_space(inode, 0, num_pages);
  
  out_put:
        iput(inode);
@@@ -3751,6 -3829,7 +3834,7 @@@ static int update_space_info(struct btr
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
+       found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@@ -3904,7 -3983,11 +3988,7 @@@ u64 btrfs_get_alloc_profile(struct btrf
        return ret;
  }
  
 -/*
 - * This will check the space that the inode allocates from to make sure we have
 - * enough space for bytes.
 - */
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
  {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@@ -4003,7 -4086,8 +4087,8 @@@ commit_trans
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
-                           trans->transaction->have_free_bgs ||
+                           test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+                                    &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
 -      ret = btrfs_qgroup_reserve(root, write_bytes);
 -      if (ret)
 -              goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
 -out:
        spin_unlock(&data_sinfo->lock);
  
        return ret;
  }
  
  /*
 - * Called if we need to clear a data reservation for this inode.
 + * New check_data_free_space() with the ability to do precise data reservation.
 + * It will eventually replace the old btrfs_check_data_free_space(); to keep
 + * the patch series split, the new function is added first and callers are
 + * converted later.
 + */
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 +{
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      int ret;
 +
 +      /* align the range */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
 +
 +      ret = btrfs_alloc_data_chunk_ondemand(inode, len);
 +      if (ret < 0)
 +              return ret;
 +
 +      /*
 +       * Use the new btrfs_qgroup_reserve_data() to reserve precise data space.
 +       *
 +       * TODO: Find a good way to avoid reserving data space for NOCOW ranges
 +       * without impacting performance in the quota-disabled case.
 +       */
 +      ret = btrfs_qgroup_reserve_data(inode, start, len);
 +      return ret;
 +}
 +
 +/*
 + * Called if we need to clear a data reservation for this inode,
 + * normally in an error case.
 + *
 + * This one will *NOT* use the accurate qgroup reserved space API; it is only
 + * for cases where we can't sleep and are sure it won't affect the qgroup
 + * reserved space.
 + * Like clear_bit_hook().
   */
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
  
 -      /* make sure bytes are sectorsize aligned */
 -      bytes = ALIGN(bytes, root->sectorsize);
 +      /* Make sure the range is aligned to sectorsize */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
  
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
 -      WARN_ON(data_sinfo->bytes_may_use < bytes);
 -      data_sinfo->bytes_may_use -= bytes;
 +      if (WARN_ON(data_sinfo->bytes_may_use < len))
 +              data_sinfo->bytes_may_use = 0;
 +      else
 +              data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
 -                                    data_sinfo->flags, bytes, 0);
 +                                    data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
  }
  
 +/*
 + * Called if we need to clear a data reservation for this inode,
 + * normally in an error case.
 + *
 + * This one will handle the per-inode data rsv map for the accurate reserved
 + * space framework.
 + */
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 +{
 +      btrfs_free_reserved_data_space_noquota(inode, start, len);
 +      btrfs_qgroup_free_data(inode, start, len);
 +}
 +
  static void force_metadata_allocation(struct btrfs_fs_info *info)
  {
        struct list_head *head = &info->space_info;
@@@ -5381,7 -5417,7 +5466,7 @@@ int btrfs_subvolume_reserve_metadata(st
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
 -              ret = btrfs_qgroup_reserve(root, num_bytes);
 +              ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
  
 -      if (ret) {
 -              if (*qgroup_reserved)
 -                      btrfs_qgroup_free(root, *qgroup_reserved);
 -      }
 +      if (ret && *qgroup_reserved)
 +              btrfs_qgroup_free_meta(root, *qgroup_reserved);
  
        return ret;
  }
@@@ -5561,15 -5599,15 +5646,15 @@@ int btrfs_delalloc_reserve_metadata(str
        spin_unlock(&BTRFS_I(inode)->lock);
  
        if (root->fs_info->quota_enabled) {
 -              ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
 +              ret = btrfs_qgroup_reserve_meta(root,
 +                              nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
  
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
 -              if (root->fs_info->quota_enabled)
 -                      btrfs_qgroup_free(root, nr_extents * root->nodesize);
 +              btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
  
@@@ -5692,48 -5730,41 +5777,48 @@@ void btrfs_delalloc_release_metadata(st
  }
  
  /**
 - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
 + * btrfs_delalloc_reserve_space - reserve data and metadata space for
 + * delalloc
   * @inode: inode we're writing to
 - * @num_bytes: the number of bytes we want to allocate
 + * @start: start range we are writing to
 + * @len: length of the range we are writing to
 + *
 + * TODO: This function will eventually replace the old btrfs_delalloc_reserve_space()
   *
   * This will do the following things
   *
 - * o reserve space in the data space info for num_bytes
 - * o reserve space in the metadata space info based on number of outstanding
 + * o reserve space in the data space info for len bytes
 + *   and reserve the corresponding precise qgroup space
 + *   (Done in check_data_free_space)
 + *
 + * o reserve metadata space, based on the number of outstanding
   *   extents and how much csums will be needed
 - * o add to the inodes ->delalloc_bytes
 + *   also reserve metadata space in a per-root over-reserve manner.
 + * o add to the inode's ->delalloc_bytes
   * o add it to the fs_info's delalloc inodes list.
 + *   (All three of the above are done in delalloc_reserve_metadata)
   *
 - * This will return 0 for success and -ENOSPC if there is no space left.
 + * Return 0 for success
 + * Return <0 for error (-ENOSPC or -EDQUOT)
   */
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
  {
        int ret;
  
 -      ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
 -      if (ret)
 -              return ret;
 -
 -      ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
 -      if (ret) {
 -              btrfs_free_reserved_data_space(inode, num_bytes);
 +      ret = btrfs_check_data_free_space(inode, start, len);
 +      if (ret < 0)
                return ret;
 -      }
 -
 -      return 0;
 +      ret = btrfs_delalloc_reserve_metadata(inode, len);
 +      if (ret < 0)
 +              btrfs_free_reserved_data_space(inode, start, len);
 +      return ret;
  }
  
  /**
   * btrfs_delalloc_release_space - release data and metadata space for delalloc
   * @inode: inode we're releasing space for
 - * @num_bytes: the number of bytes we want to free up
 + * @start: start position of the space already reserved
 + * @len: length of the space already reserved
   *
   * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
   * called in the case that we don't need the metadata AND data reservations
   * This function will release the metadata space that was not used and will
   * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
   * list if there are no delalloc bytes left.
 + * Also it will handle the qgroup reserved space.
   */
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
  {
 -      btrfs_delalloc_release_metadata(inode, num_bytes);
 -      btrfs_free_reserved_data_space(inode, num_bytes);
 +      btrfs_delalloc_release_metadata(inode, len);
 +      btrfs_free_reserved_data_space(inode, start, len);
  }
  
  static int update_block_group(struct btrfs_trans_handle *trans,
@@@ -6112,6 -6142,34 +6197,34 @@@ void btrfs_prepare_extent_commit(struc
        update_global_block_rsv(fs_info);
  }
  
+ /*
+  * Returns the free cluster for the given space info and sets empty_cluster to
+  * what it should be based on the mount options.
+  */
+ static struct btrfs_free_cluster *
+ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+                  u64 *empty_cluster)
+ {
+       struct btrfs_free_cluster *ret = NULL;
+       bool ssd = btrfs_test_opt(root, SSD);
+       *empty_cluster = 0;
+       if (btrfs_mixed_space_info(space_info))
+               return ret;
+       if (ssd)
+               *empty_cluster = 2 * 1024 * 1024;
+       if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+               ret = &root->fs_info->meta_alloc_cluster;
+               if (!ssd)
+                       *empty_cluster = 64 * 1024;
+       } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+               ret = &root->fs_info->data_alloc_cluster;
+       }
+       return ret;
+ }
  static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                              const bool return_free_space)
  {
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       struct btrfs_free_cluster *cluster = NULL;
        u64 len;
+       u64 total_unpinned = 0;
+       u64 empty_cluster = 0;
        bool readonly;
  
        while (start <= end) {
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
+                       total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache); /* Logic error */
+                       cluster = fetch_cluster_info(root,
+                                                    cache->space_info,
+                                                    &empty_cluster);
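+                       /*
+                        * Doubled so that the fragmented flag is only cleared
+                        * below once well more than one cluster's worth of
+                        * space has been unpinned in this block group.
+                        */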
+                       empty_cluster <<= 1;
                }
  
                len = cache->key.objectid + cache->key.offset - start;
                }
  
                start += len;
+               total_unpinned += len;
                space_info = cache->space_info;
  
+               /*
+                * If this space cluster has been marked as fragmented and we've
+                * unpinned enough in this block group to potentially allow a
+                * cluster to be created inside of it, go ahead and clear the
+                * fragmented flag.
+                */
+               if (cluster && cluster->fragmented &&
+                   total_unpinned > empty_cluster) {
+                       spin_lock(&cluster->lock);
+                       cluster->fragmented = 0;
+                       spin_unlock(&cluster->lock);
+               }
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
+               space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@@ -6880,7 -6962,7 +7017,7 @@@ static noinline int find_free_extent(st
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
-       int empty_cluster = 2 * 1024 * 1024;
+       u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
+       bool full_search = false;
  
        WARN_ON(num_bytes < root->sectorsize);
        ins->type = BTRFS_EXTENT_ITEM_KEY;
        }
  
        /*
-        * If the space info is for both data and metadata it means we have a
-        * small filesystem and we can't use the clustering stuff.
+        * If our free space is heavily fragmented we may not be able to make
+        * big contiguous allocations, so instead of doing the expensive search
+        * for free space, simply return ENOSPC with our max_extent_size so we
+        * can go ahead and search for a more manageable chunk.
+        *
+        * If our max_extent_size is large enough for our allocation simply
+        * disable clustering since we will likely not be able to find enough
+        * space to create a cluster and induce latency trying.
         */
-       if (btrfs_mixed_space_info(space_info))
-               use_cluster = false;
-       if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
-               last_ptr = &root->fs_info->meta_alloc_cluster;
-               if (!btrfs_test_opt(root, SSD))
-                       empty_cluster = 64 * 1024;
-       }
-       if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
-           btrfs_test_opt(root, SSD)) {
-               last_ptr = &root->fs_info->data_alloc_cluster;
+       if (unlikely(space_info->max_extent_size)) {
+               spin_lock(&space_info->lock);
+               if (space_info->max_extent_size &&
+                   num_bytes > space_info->max_extent_size) {
+                       ins->offset = space_info->max_extent_size;
+                       spin_unlock(&space_info->lock);
+                       return -ENOSPC;
+               } else if (space_info->max_extent_size) {
+                       use_cluster = false;
+               }
+               spin_unlock(&space_info->lock);
        }
  
+       last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
                        hint_byte = last_ptr->window_start;
+               if (last_ptr->fragmented) {
+                       /*
+                        * We still set window_start so we can keep track of the
+                        * last place we found an allocation to try and save
+                        * some time.
+                        */
+                       hint_byte = last_ptr->window_start;
+                       use_cluster = false;
+               }
                spin_unlock(&last_ptr->lock);
        }
  
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
-       if (!last_ptr)
-               empty_cluster = 0;
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
        }
  search:
        have_caching_bg = false;
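+       /*
+        * Starting at index 0 or at our natural raid index means this pass
+        * will end up considering every block group for this allocation, so
+        * note that we are doing a full search.
+        */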
+       if (index == 0 || index == __get_raid_index(flags))
+               full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
  have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
+                       have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
                 */
-               if (last_ptr) {
+               if (last_ptr && use_cluster) {
                        struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
@@@ -7142,6 -7239,16 +7294,16 @@@ refill_cluster
                }
  
  unclustered_alloc:
+               /*
+                * We are doing an unclustered alloc, set the fragmented flag so
+                * we don't bother trying to set up a cluster again until we get
+                * more space.
+                */
+               if (unlikely(last_ptr)) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->fragmented = 1;
+                       spin_unlock(&last_ptr->lock);
+               }
                spin_lock(&block_group->free_space_ctl->tree_lock);
                if (cached &&
                    block_group->free_space_ctl->free_space <
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
-                       if (!cached)
-                               have_caching_bg = true;
                        goto loop;
                }
  checks:
@@@ -7232,7 -7337,20 +7392,20 @@@ loop
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
-               loop++;
+               if (loop == LOOP_CACHING_NOWAIT) {
+                       /*
+                        * We want to skip the LOOP_CACHING_WAIT step if we
+                        * don't have any uncached bgs and we've already done a
+                        * full search through.
+                        */
+                       if (have_caching_bg || !full_search)
+                               loop = LOOP_CACHING_WAIT;
+                       else
+                               loop = LOOP_ALLOC_CHUNK;
+               } else {
+                       loop++;
+               }
                if (loop == LOOP_ALLOC_CHUNK) {
                        struct btrfs_trans_handle *trans;
                        int exist = 0;
  
                        ret = do_chunk_alloc(trans, root, flags,
                                             CHUNK_ALLOC_FORCE);
+                       /*
+                        * If we can't allocate a new chunk we've already looped
+                        * through everything at least once, so move on to the
+                        * NO_EMPTY_SIZE case.
+                        */
+                       if (ret == -ENOSPC)
+                               loop = LOOP_NO_EMPTY_SIZE;
                        /*
                         * Do not bail out on ENOSPC since we
                         * can do more things.
                }
  
                if (loop == LOOP_NO_EMPTY_SIZE) {
+                       /*
+                        * Don't loop again if we already have no empty_size and
+                        * no empty_cluster.
+                        */
+                       if (empty_size == 0 &&
+                           empty_cluster == 0) {
+                               ret = -ENOSPC;
+                               goto out;
+                       }
                        empty_size = 0;
                        empty_cluster = 0;
                }
        } else if (!ins->objectid) {
                ret = -ENOSPC;
        } else if (ins->objectid) {
+               if (!use_cluster && last_ptr) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->window_start = ins->objectid;
+                       spin_unlock(&last_ptr->lock);
+               }
                ret = 0;
        }
  out:
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
+               spin_lock(&space_info->lock);
+               space_info->max_extent_size = max_extent_size;
+               spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
+       }
        return ret;
  }
  
@@@ -7327,7 -7472,7 +7527,7 @@@ int btrfs_reserve_extent(struct btrfs_r
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
  {
-       bool final_tried = false;
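+       /*
+        * If the requested size is already the minimum there is nothing
+        * smaller to fall back to, so treat the first attempt as the final one.
+        */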
+       bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
  
@@@ -8929,7 -9074,7 +9129,7 @@@ again
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
-       if (trans->transaction->dirty_bg_run) {
+       if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
  
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@@ -9679,6 -9824,14 +9879,14 @@@ int btrfs_make_block_group(struct btrfs
  
        free_excluded_extents(root, cache);
  
+ #ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(root, cache)) {
+               u64 new_bytes_used = size - bytes_used;
+               bytes_used += new_bytes_used >> 1;
+               fragment_free_space(root, cache);
+       }
+ #endif
        /*
         * Call to ensure the corresponding space_info object is created and
         * assigned to our block group, but don't update its counters just yet.
diff --combined fs/btrfs/inode.c
@@@ -310,13 -310,6 +310,13 @@@ static noinline int cow_file_range_inli
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
  out:
 +      /*
 +       * Don't forget to free the reserved space: an inlined extent won't
 +       * count as a data extent, so free the space directly here.
 +       * At reserve time the range is always aligned to page size, so just
 +       * free one page here.
 +       */
 +      btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
@@@ -1776,8 -1769,7 +1776,8 @@@ static void btrfs_clear_bit_hook(struc
  
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
 -                      btrfs_free_reserved_data_space(inode, len);
 +                      btrfs_free_reserved_data_space_noquota(inode,
 +                                      state->start, len);
  
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
                                     root->fs_info->delalloc_batch);
@@@ -2000,8 -1992,7 +2000,8 @@@ again
                goto again;
        }
  
 -      ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 +      ret = btrfs_delalloc_reserve_space(inode, page_start,
 +                                         PAGE_CACHE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
@@@ -2128,16 -2119,6 +2128,16 @@@ static int insert_reserved_file_extent(
        ret = btrfs_alloc_reserved_file_extent(trans, root,
                                        root->root_key.objectid,
                                        btrfs_ino(inode), file_pos, &ins);
 +      if (ret < 0)
 +              goto out;
 +      /*
 +       * Release the reserved range from inode dirty range map, and
 +       * move it to delayed ref codes, as now accounting only happens at
 +       * commit_transaction() time.
 +       */
 +      btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
 +      ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
 +                      root->objectid, disk_bytenr, ram_bytes);
  out:
        btrfs_free_path(path);
  
@@@ -2845,14 -2826,6 +2845,14 @@@ static int btrfs_finish_ordered_io(stru
  
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
 +
 +              /*
 +               * For the mwrite (mmap + memset to write) case, we still
 +               * reserve space for the NOCOW range.
 +               * As NOCOW won't create a new delayed ref, just free the space.
 +               */
 +              btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
 +                                     ordered_extent->len);
                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (nolock)
                        trans = btrfs_join_transaction_nolock(root);
@@@ -4655,17 -4628,14 +4655,17 @@@ int btrfs_truncate_page(struct inode *i
        if ((offset & (blocksize - 1)) == 0 &&
            (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
 -      ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 +      ret = btrfs_delalloc_reserve_space(inode,
 +                      round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
        if (ret)
                goto out;
  
  again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
 -              btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 +              btrfs_delalloc_release_space(inode,
 +                              round_down(from, PAGE_CACHE_SIZE),
 +                              PAGE_CACHE_SIZE);
                ret = -ENOMEM;
                goto out;
        }
  
  out_unlock:
        if (ret)
 -              btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 +              btrfs_delalloc_release_space(inode, page_start,
 +                                           PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
  out:
@@@ -5132,18 -5101,6 +5132,18 @@@ static void evict_inode_truncate_pages(
                spin_unlock(&io_tree->lock);
  
                lock_extent_bits(io_tree, start, end, 0, &cached_state);
 +
 +              /*
 +               * If the DELALLOC flag is still set, the extent didn't reach disk,
 +               * and its reserved space won't be freed by delayed_ref.
 +               * So we need to free its reserved space here.
 +               * (Refer to comment in btrfs_invalidatepage, case 2)
 +               *
 +               * Note, end is the bytenr of last byte, so we need + 1 here.
 +               */
 +              if (state->state & EXTENT_DELALLOC)
 +                      btrfs_qgroup_free_data(inode, start, end - start + 1);
 +
                clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@@ -7677,7 -7634,7 +7677,7 @@@ unlock
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
  
 -              btrfs_free_reserved_data_space(inode, len);
 +              btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
                current->journal_info = dio_data;
@@@ -8467,7 -8424,7 +8467,7 @@@ static ssize_t btrfs_direct_IO(struct k
                        mutex_unlock(&inode->i_mutex);
                        relock = true;
                }
 -              ret = btrfs_delalloc_reserve_space(inode, count);
 +              ret = btrfs_delalloc_reserve_space(inode, offset, count);
                if (ret)
                        goto out;
                dio_data.outstanding_extents = div64_u64(count +
                current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED) {
                        if (dio_data.reserve)
 -                              btrfs_delalloc_release_space(inode,
 -                                                      dio_data.reserve);
 +                              btrfs_delalloc_release_space(inode, offset,
 +                                                           dio_data.reserve);
                } else if (ret >= 0 && (size_t)ret < count)
 -                      btrfs_delalloc_release_space(inode,
 +                      btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
        }
  out:
@@@ -8658,18 -8615,6 +8658,18 @@@ static void btrfs_invalidatepage(struc
                }
        }
  
 +      /*
 +       * Qgroup reserved space handler
 +       * Page here will be either
 +       * 1) Already written to disk
 +       *    In this case, its reserved space is released from data rsv map
 +       *    and will be freed by delayed_ref handler finally.
 +       *    So even if we call qgroup_free_data(), it won't decrease reserved
 +       *    space.
 +       * 2) Not written to disk
 +       *    This means the reserved space should be freed here.
 +       */
 +      btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
@@@ -8720,11 -8665,7 +8720,11 @@@ int btrfs_page_mkwrite(struct vm_area_s
        u64 page_end;
  
        sb_start_pagefault(inode->i_sb);
 -      ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 +      page_start = page_offset(page);
 +      page_end = page_start + PAGE_CACHE_SIZE - 1;
 +
 +      ret = btrfs_delalloc_reserve_space(inode, page_start,
 +                                         PAGE_CACHE_SIZE);
        if (!ret) {
                ret = file_update_time(vma->vm_file);
                reserved = 1;
  again:
        lock_page(page);
        size = i_size_read(inode);
 -      page_start = page_offset(page);
 -      page_end = page_start + PAGE_CACHE_SIZE - 1;
  
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
@@@ -8819,7 -8762,7 +8819,7 @@@ out_unlock
        }
        unlock_page(page);
  out:
 -      btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 +      btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
  out_noreserve:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@@ -9108,7 -9051,6 +9108,7 @@@ void btrfs_destroy_inode(struct inode *
                        btrfs_put_ordered_extent(ordered);
                }
        }
 +      btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
  free:
@@@ -9745,6 -9687,7 +9745,7 @@@ static int __btrfs_prealloc_file_range(
        u64 cur_offset = start;
        u64 i_size;
        u64 cur_bytes;
+       u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
  
  
                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
                cur_bytes = max(cur_bytes, min_size);
+               /*
+                * If we are severely fragmented we could end up with really
+                * small allocations, so if the allocator is returning small
+                * chunks, let's make its job easier by only searching for those
+                * sized chunks.
+                */
+               cur_bytes = min(cur_bytes, last_alloc);
                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
                                           *alloc_hint, &ins, 1, 0);
                if (ret) {
                        break;
                }
  
+               last_alloc = ins.offset;
                ret = insert_reserved_file_extent(trans, inode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,
diff --combined fs/btrfs/transaction.c
@@@ -232,15 -232,16 +232,16 @@@ loop
        extwriter_counter_init(cur_trans, type);
        init_waitqueue_head(&cur_trans->writer_wait);
        init_waitqueue_head(&cur_trans->commit_wait);
+       init_waitqueue_head(&cur_trans->pending_wait);
        cur_trans->state = TRANS_STATE_RUNNING;
        /*
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
        atomic_set(&cur_trans->use_count, 2);
-       cur_trans->have_free_bgs = 0;
+       atomic_set(&cur_trans->pending_ordered, 0);
+       cur_trans->flags = 0;
        cur_trans->start_time = get_seconds();
-       cur_trans->dirty_bg_run = 0;
  
        memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
  
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
-       INIT_LIST_HEAD(&cur_trans->pending_ordered);
        INIT_LIST_HEAD(&cur_trans->dirty_bgs);
        INIT_LIST_HEAD(&cur_trans->io_bgs);
        INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@@ -480,10 -480,13 +480,10 @@@ start_transaction(struct btrfs_root *ro
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
 -              if (root->fs_info->quota_enabled &&
 -                  is_fstree(root->root_key.objectid)) {
 -                      qgroup_reserved = num_items * root->nodesize;
 -                      ret = btrfs_qgroup_reserve(root, qgroup_reserved);
 -                      if (ret)
 -                              return ERR_PTR(ret);
 -              }
 +              qgroup_reserved = num_items * root->nodesize;
 +              ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
 +              if (ret)
 +                      return ERR_PTR(ret);
  
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                /*
@@@ -544,12 -547,10 +544,11 @@@ again
        h->transaction = cur_trans;
        h->root = root;
        h->use_count = 1;
 +
        h->type = type;
        h->can_flush_pending_bgs = true;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
-       INIT_LIST_HEAD(&h->ordered);
  
        smp_mb();
        if (cur_trans->state >= TRANS_STATE_BLOCKED &&
                h->bytes_reserved = num_bytes;
                h->reloc_reserved = reloc_reserved;
        }
 -      h->qgroup_reserved = qgroup_reserved;
  
  got_it:
        btrfs_record_root_in_trans(h, root);
@@@ -583,7 -585,8 +582,7 @@@ alloc_fail
                btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                        num_bytes);
  reserve_fail:
 -      if (qgroup_reserved)
 -              btrfs_qgroup_free(root, qgroup_reserved);
 +      btrfs_qgroup_free_meta(root, qgroup_reserved);
        return ERR_PTR(ret);
  }
  
@@@ -780,12 -783,6 +779,6 @@@ static int __btrfs_end_transaction(stru
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
  
-       if (!list_empty(&trans->ordered)) {
-               spin_lock(&info->trans_lock);
-               list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
-               spin_unlock(&info->trans_lock);
-       }
        trans->delayed_ref_updates = 0;
        if (!trans->sync) {
                must_run_delayed_refs =
                        must_run_delayed_refs = 2;
        }
  
 -      if (trans->qgroup_reserved) {
 -              /*
 -               * the same root has to be passed here between start_transaction
 -               * and end_transaction. Subvolume quota depends on this.
 -               */
 -              btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
 -              trans->qgroup_reserved = 0;
 -      }
 -
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
  
@@@ -1218,7 -1224,6 +1211,7 @@@ static noinline int commit_fs_roots(str
                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
 +                      btrfs_qgroup_free_meta_all(root);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
@@@ -1776,25 -1781,10 +1769,10 @@@ static inline void btrfs_wait_delalloc_
  }
  
  static inline void
- btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
-                          struct btrfs_fs_info *fs_info)
+ btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
  {
-       struct btrfs_ordered_extent *ordered;
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-               wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
-                                                  &ordered->flags));
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
+       wait_event(cur_trans->pending_wait,
+                  atomic_read(&cur_trans->pending_ordered) == 0);
  }
  
  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 -      if (trans->qgroup_reserved) {
 -              btrfs_qgroup_free(root, trans->qgroup_reserved);
 -              trans->qgroup_reserved = 0;
 -      }
  
        cur_trans = trans->transaction;
  
                return ret;
        }
  
-       if (!cur_trans->dirty_bg_run) {
+       if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
                int run_it = 0;
  
                /* this mutex is also taken before trying to set
                 * after a extents from that block group have been
                 * allocated for cache files.  btrfs_set_block_group_ro
                 * will wait for the transaction to commit if it
-                * finds dirty_bg_run = 1
+                * finds BTRFS_TRANS_DIRTY_BG_RUN set.
                 *
-                * The dirty_bg_run flag is also used to make sure only
-                * one process starts all the block group IO.  It wouldn't
+                * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+                * only one process starts all the block group IO.  It wouldn't
                 * hurt to have more than one go through, but there's no
                 * real advantage to it either.
                 */
                mutex_lock(&root->fs_info->ro_block_group_mutex);
-               if (!cur_trans->dirty_bg_run) {
+               if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+                                     &cur_trans->flags))
                        run_it = 1;
-                       cur_trans->dirty_bg_run = 1;
-               }
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
  
                if (run_it)
        }
  
        spin_lock(&root->fs_info->trans_lock);
-       list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
        if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                spin_unlock(&root->fs_info->trans_lock);
                atomic_inc(&cur_trans->use_count);
  
        btrfs_wait_delalloc_flush(root->fs_info);
  
-       btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+       btrfs_wait_pending_ordered(cur_trans);
  
        btrfs_scrub_pause(root);
        /*
  
        btrfs_finish_extent_commit(trans, root);
  
-       if (cur_trans->have_free_bgs)
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
                btrfs_clear_space_info_full(root->fs_info);
  
        root->fs_info->last_trans_committed = cur_trans->transid;
@@@ -2175,6 -2167,10 +2151,6 @@@ cleanup_transaction
        btrfs_trans_release_metadata(trans, root);
        btrfs_trans_release_chunk_metadata(trans);
        trans->block_rsv = NULL;
 -      if (trans->qgroup_reserved) {
 -              btrfs_qgroup_free(root, trans->qgroup_reserved);
 -              trans->qgroup_reserved = 0;
 -      }
        btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
        if (current->journal_info == trans)
                current->journal_info = NULL;
diff --combined fs/btrfs/transaction.h
@@@ -32,6 -32,10 +32,10 @@@ enum btrfs_trans_state 
        TRANS_STATE_MAX                 = 6,
  };
  
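+ /* Flag bits for btrfs_transaction::flags, used with set_bit()/test_bit() */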
+ #define BTRFS_TRANS_HAVE_FREE_BGS     0
+ #define BTRFS_TRANS_DIRTY_BG_RUN      1
+ #define BTRFS_TRANS_CACHE_ENOSPC      2
  struct btrfs_transaction {
        u64 transid;
        /*
         */
        atomic_t num_writers;
        atomic_t use_count;
+       atomic_t pending_ordered;
  
-       /*
-        * true if there is free bgs operations in this transaction
-        */
-       int have_free_bgs;
+       unsigned long flags;
  
        /* Be protected by fs_info->trans_lock when we want to change it. */
        enum btrfs_trans_state state;
@@@ -59,9 -61,9 +61,9 @@@
        unsigned long start_time;
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
+       wait_queue_head_t pending_wait;
        struct list_head pending_snapshots;
        struct list_head pending_chunks;
-       struct list_head pending_ordered;
        struct list_head switch_commits;
        struct list_head dirty_bgs;
        struct list_head io_bgs;
@@@ -80,7 -82,6 +82,6 @@@
        spinlock_t dropped_roots_lock;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
-       int dirty_bg_run;
  };
  
  #define __TRANS_FREEZABLE     (1U << 0)
@@@ -107,6 -108,7 +108,6 @@@ struct btrfs_trans_handle 
        u64 transid;
        u64 bytes_reserved;
        u64 chunk_bytes_reserved;
 -      u64 qgroup_reserved;
        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
         */
        struct btrfs_root *root;
        struct seq_list delayed_ref_elem;
-       struct list_head ordered;
        struct list_head qgroup_ref_list;
        struct list_head new_bgs;
  };