Btrfs: don't continue setting up space cache when enospc
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f96042..3185c45 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -332,6 +332,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
                kfree(ctl);
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+                               struct btrfs_block_group_cache *block_group)
+{
+       u64 start = block_group->key.objectid;
+       u64 len = block_group->key.offset;
+       u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+               root->nodesize : root->sectorsize;
+       u64 step = chunk << 1;
+
+       while (len > chunk) {
+               btrfs_remove_free_space(block_group, start, chunk);
+               start += step;
+               if (len < step)
+                       len = 0;
+               else
+                       len -= step;
+       }
+}
+#endif
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
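For reference, a minimal user-space sketch of the stride used by fragment_free_space() above (the sizes and the printf stand-in are illustrative, not kernel code): the loop walks the block group in 2*chunk strides and drops the first chunk of each stride from the free space cache, leaving a checkerboard in which roughly half of the group stays allocatable, in nodesize granules for metadata groups and sectorsize granules for data groups.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t start = 0;             /* pretend block group start  */
	uint64_t len   = 128 * 1024;    /* pretend block group length */
	uint64_t chunk = 16 * 1024;     /* pretend metadata nodesize  */
	uint64_t step  = chunk << 1;

	/* Same shape as the loop above: remove the first chunk of every
	 * 2*chunk stride, keep the second one. */
	while (len > chunk) {
		printf("remove free space [%llu, %llu)\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + chunk));
		start += step;
		len = (len < step) ? 0 : len - step;
	}
	return 0;
}

Run against a 128K group with 16K chunks this drops [0,16K), [32K,48K), [64K,80K) and [96K,112K), i.e. exactly half of the space.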
@@ -388,6 +409,7 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 last = 0;
        u32 nritems;
        int ret = -ENOMEM;
+       bool wakeup = true;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
@@ -400,6 +422,15 @@ static noinline void caching_thread(struct btrfs_work *work)
 
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       /*
+        * If we're fragmenting we don't want to make anybody think we can
+        * allocate from this block group until we've had a chance to fragment
+        * the free space.
+        */
+       if (btrfs_should_fragment_free_space(extent_root, block_group))
+               wakeup = false;
+#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@ -441,7 +472,8 @@ next:
 
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
-                               caching_ctl->progress = last;
+                               if (wakeup)
+                                       caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
@@ -464,7 +496,8 @@ next:
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
 
-                       caching_ctl->progress = last;
+                       if (wakeup)
+                               caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
@@ -491,7 +524,8 @@ next:
 
                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
-                               wake_up(&caching_ctl->wait);
+                               if (wakeup)
+                                       wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
@@ -501,13 +535,27 @@ next:
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
-       caching_ctl->progress = (u64)-1;
-
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+               u64 bytes_used;
+
+               spin_lock(&block_group->space_info->lock);
+               spin_lock(&block_group->lock);
+               bytes_used = block_group->key.offset -
+                       btrfs_block_group_used(&block_group->item);
+               block_group->space_info->bytes_used += bytes_used >> 1;
+               spin_unlock(&block_group->lock);
+               spin_unlock(&block_group->space_info->lock);
+               fragment_free_space(extent_root, block_group);
+       }
+#endif
+
+       caching_ctl->progress = (u64)-1;
 err:
        btrfs_free_path(path);
        up_read(&fs_info->commit_root_sem);
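Two debug-only details in this hunk are worth spelling out. First, caching_ctl->progress = (u64)-1 moves below the fragmentation block (together with the wakeup checks earlier in the thread), so waiters do not treat the group as fully cached and start allocating from it before the holes have been punched. Second, the accounting: bytes_used here is the group's nominally free space, and half of it is added to space_info->bytes_used because fragment_free_space() is about to remove every other chunk. For example, a completely empty 1GiB block group gives bytes_used = 1GiB - 0, so space_info->bytes_used grows by 512MiB, matching the half of the group that is no longer allocatable.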
@@ -607,6 +655,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        }
                }
                spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+               if (ret == 1 &&
+                   btrfs_should_fragment_free_space(fs_info->extent_root,
+                                                    cache)) {
+                       u64 bytes_used;
+
+                       spin_lock(&cache->space_info->lock);
+                       spin_lock(&cache->lock);
+                       bytes_used = cache->key.offset -
+                               btrfs_block_group_used(&cache->item);
+                       cache->space_info->bytes_used += bytes_used >> 1;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
+                       fragment_free_space(fs_info->extent_root, cache);
+               }
+#endif
                mutex_unlock(&caching_ctl->mutex);
 
                wake_up(&caching_ctl->wait);
@@ -2828,6 +2892,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_head *head;
        int ret;
        int run_all = count == (unsigned long)-1;
+       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2844,6 +2909,7 @@ again:
 #ifdef SCRAMBLE_DELAYED_REFS
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
+       trans->can_flush_pending_bgs = false;
        ret = __btrfs_run_delayed_refs(trans, root, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, root, ret);
@@ -2893,6 +2959,7 @@ again:
        }
 out:
        assert_qgroups_uptodate(trans);
+       trans->can_flush_pending_bgs = can_flush_pending_bgs;
        return 0;
 }
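btrfs_run_delayed_refs() now saves the handle's can_flush_pending_bgs, clears it for the duration of __btrfs_run_delayed_refs(), and restores it on the way out. The apparent intent is that a chunk allocation performed while delayed refs are being run must not flush the transaction's pending block groups (see the do_chunk_alloc hunk further down, which now checks this flag), since that would insert block group items and queue more extent-tree work underneath the pass that is currently draining it; btrfs_create_pending_block_groups() below takes the same precaution so it cannot re-enter itself while walking the new_bgs list.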
 
@@ -3335,6 +3402,15 @@ again:
        }
        spin_unlock(&block_group->lock);
 
+       /*
+        * We hit an ENOSPC when setting up the cache in this transaction, just
+        * skip doing the setup, we've already cleared the cache so we're safe.
+        */
+       if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+               ret = -ENOSPC;
+               goto out_put;
+       }
+
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
@@ -3355,8 +3431,18 @@ again:
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       /*
+        * Our cache requires contiguous chunks so that we don't modify a bunch
+        * of metadata or split extents when writing the cache out, which means
+        * we can enospc if we are heavily fragmented in addition to just normal
+        * out of space conditions.  So if we hit this just skip setting up any
+        * other block groups for this transaction, maybe we'll unpin enough
+        * space the next time around.
+        */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
+       else if (ret == -ENOSPC)
+               set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
        btrfs_free_reserved_data_space(inode, num_pages);
 
 out_put:
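These two hunks are the change named in the commit title. The first time btrfs_prealloc_file_range_trans() fails with -ENOSPC, BTRFS_TRANS_CACHE_ENOSPC is set on the transaction; every later cache_save_setup() call in the same transaction sees the bit (the test added in the previous hunk) and skips the setup outright instead of repeating a preallocation that needs contiguous space a fragmented filesystem cannot provide. The bit lives in transaction->flags, part of the broader move in this diff from individual bools (have_free_bgs, dirty_bg_run) to a single flags word manipulated with set_bit()/test_bit().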
@@ -3743,6 +3829,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
+       found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@ -3819,7 +3906,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
        u64 target;
-       u64 tmp;
+       u64 raid_type;
+       u64 allowed = 0;
 
        /*
         * see if restripe for this chunk_type is in progress, if so
@@ -3837,31 +3925,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        spin_unlock(&root->fs_info->balance_lock);
 
        /* First, mask out the RAID levels which aren't possible */
-       if (num_devices == 1)
-               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
-                          BTRFS_BLOCK_GROUP_RAID5);
-       if (num_devices < 3)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID6;
-       if (num_devices < 4)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
-       tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
-                      BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
-                      BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
-       flags &= ~tmp;
-
-       if (tmp & BTRFS_BLOCK_GROUP_RAID6)
-               tmp = BTRFS_BLOCK_GROUP_RAID6;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
-               tmp = BTRFS_BLOCK_GROUP_RAID5;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
-               tmp = BTRFS_BLOCK_GROUP_RAID10;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
-               tmp = BTRFS_BLOCK_GROUP_RAID1;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
-               tmp = BTRFS_BLOCK_GROUP_RAID0;
-
-       return extended_to_chunk(flags | tmp);
+       for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+               if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+                       allowed |= btrfs_raid_group[raid_type];
+       }
+       allowed &= flags;
+
+       if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+               allowed = BTRFS_BLOCK_GROUP_RAID6;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+               allowed = BTRFS_BLOCK_GROUP_RAID5;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+               allowed = BTRFS_BLOCK_GROUP_RAID10;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+               allowed = BTRFS_BLOCK_GROUP_RAID1;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+               allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+       flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       return extended_to_chunk(flags | allowed);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
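The rewritten btrfs_reduce_alloc_profile() builds the allowed-profile mask from btrfs_raid_array[].devs_min instead of the old hand-rolled device-count checks, then collapses it to a single profile using the same priority order as before. A stand-alone sketch of that selection follows; the flag values and minimum device counts are written out here as assumptions for illustration, not taken verbatim from the kernel tables, and the sketch ignores the DATA/METADATA/SYSTEM type bits that the kernel function preserves.

#include <stdio.h>
#include <stdint.h>

#define BG_RAID0   (1ULL << 0)
#define BG_RAID1   (1ULL << 1)
#define BG_DUP     (1ULL << 2)
#define BG_RAID10  (1ULL << 3)
#define BG_RAID5   (1ULL << 4)
#define BG_RAID6   (1ULL << 5)

struct raid_attr { uint64_t flag; int devs_min; };

/* Assumed minimum device counts, standing in for btrfs_raid_array. */
static const struct raid_attr raid_table[] = {
	{ BG_RAID10, 4 }, { BG_RAID1, 2 }, { BG_DUP, 1 },
	{ BG_RAID0, 2 },  { BG_RAID6, 3 }, { BG_RAID5, 2 },
};

static uint64_t reduce_profile(uint64_t flags, int num_devices)
{
	uint64_t allowed = 0;
	size_t i;

	/* Mask out the RAID levels the device count cannot support. */
	for (i = 0; i < sizeof(raid_table) / sizeof(raid_table[0]); i++)
		if (num_devices >= raid_table[i].devs_min)
			allowed |= raid_table[i].flag;
	allowed &= flags;

	/* Keep only the "best" profile, same priority order as the hunk. */
	if (allowed & BG_RAID6)
		allowed = BG_RAID6;
	else if (allowed & BG_RAID5)
		allowed = BG_RAID5;
	else if (allowed & BG_RAID10)
		allowed = BG_RAID10;
	else if (allowed & BG_RAID1)
		allowed = BG_RAID1;
	else if (allowed & BG_RAID0)
		allowed = BG_RAID0;

	return allowed;
}

int main(void)
{
	/* With two rw devices and RAID10|RAID1 requested, RAID10 is masked
	 * out (needs four devices here) and RAID1 wins the cascade. */
	printf("0x%llx\n",
	       (unsigned long long)reduce_profile(BG_RAID10 | BG_RAID1, 2));
	return 0;
}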
@@ -4003,7 +4086,8 @@ commit_trans:
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
-                           trans->transaction->have_free_bgs ||
+                           test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+                                    &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
@@ -4306,7 +4390,8 @@ out:
         * the block groups that were made dirty during the lifetime of the
         * transaction.
         */
-       if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+       if (trans->can_flush_pending_bgs &&
+           trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
                btrfs_create_pending_block_groups(trans, trans->root);
                btrfs_trans_release_chunk_metadata(trans);
        }
@@ -4887,13 +4972,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
        struct btrfs_block_rsv *block_rsv = NULL;
 
-       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->csum_root && trans->adding_csums)
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->uuid_root)
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+           (root == root->fs_info->csum_root && trans->adding_csums) ||
+            (root == root->fs_info->uuid_root))
                block_rsv = trans->block_rsv;
 
        if (!block_rsv)
@@ -6097,6 +6178,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
+               space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@ -6853,6 +6935,29 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
                return -ENOSPC;
        }
 
+       /*
+        * If our free space is heavily fragmented we may not be able to make
+        * big contiguous allocations, so instead of doing the expensive search
+        * for free space, simply return ENOSPC with our max_extent_size so we
+        * can go ahead and search for a more manageable chunk.
+        *
+        * If our max_extent_size is large enough for our allocation simply
+        * disable clustering since we will likely not be able to find enough
+        * space to create a cluster and induce latency trying.
+        */
+       if (unlikely(space_info->max_extent_size)) {
+               spin_lock(&space_info->lock);
+               if (space_info->max_extent_size &&
+                   num_bytes > space_info->max_extent_size) {
+                       ins->offset = space_info->max_extent_size;
+                       spin_unlock(&space_info->lock);
+                       return -ENOSPC;
+               } else if (space_info->max_extent_size) {
+                       use_cluster = false;
+               }
+               spin_unlock(&space_info->lock);
+       }
+
        /*
         * If the space info is for both data and metadata it means we have a
         * small filesystem and we can't use the clustering stuff.
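space_info->max_extent_size acts as a cached ceiling on how large a contiguous allocation can currently succeed. Before doing any block group search, find_free_extent() consults it: if the request is larger than the ceiling it returns -ENOSPC immediately with ins->offset set to the ceiling, so the caller can retry at a size that has a chance; if the request fits but a ceiling is recorded, clustering is disabled, since assembling a cluster in fragmented space mostly adds latency. The ceiling is recorded under space_info->lock when a full search fails (next hunk), initialized to zero in update_space_info(), and cleared in unpin_extent_range() (hunk above), because unpinning can merge free space back into larger contiguous regions and make the stale value too pessimistic.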
@@ -7226,8 +7331,12 @@ loop:
                ret = 0;
        }
 out:
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
+               spin_lock(&space_info->lock);
+               space_info->max_extent_size = max_extent_size;
+               spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
+       }
        return ret;
 }
 
@@ -7276,7 +7385,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
 {
-       bool final_tried = false;
+       bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
 
@@ -8271,10 +8380,11 @@ skip:
                        ret = account_shared_subtree(trans, root, next,
                                                     generation, level - 1);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "Error "
                                        "%d accounting shared subtree. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
@@ -8363,10 +8473,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        BUG_ON(ret); /* -ENOMEM */
                        ret = account_leaf_items(trans, root, eb);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "error "
                                        "%d accounting leaf items. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                /* make block locked assertion in clean_tree_block happy */
@@ -8688,7 +8799,7 @@ out:
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
-               btrfs_std_error(root->fs_info, err);
+               btrfs_std_error(root->fs_info, err, NULL);
        return err;
 }
 
@@ -8876,7 +8987,7 @@ again:
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
-       if (trans->transaction->dirty_bg_run) {
+       if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
 
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -9560,7 +9671,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_item item;
        struct btrfs_key key;
        int ret = 0;
+       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
+       trans->can_flush_pending_bgs = false;
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
                if (ret)
                        goto next;
@@ -9581,6 +9694,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 next:
                list_del_init(&block_group->bg_list);
        }
+       trans->can_flush_pending_bgs = can_flush_pending_bgs;
 }
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -9623,6 +9737,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        free_excluded_extents(root, cache);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(root, cache)) {
+               u64 new_bytes_used = size - bytes_used;
+
+               bytes_used += new_bytes_used >> 1;
+               fragment_free_space(root, cache);
+       }
+#endif
        /*
         * Call to ensure the corresponding space_info object is created and
         * assigned to our block group, but don't update its counters just yet.
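Newly created block groups get the same debug treatment as ones populated by the caching thread: with fragment testing enabled, half of the group's free space is folded into bytes_used before the space_info counters are set up, and fragment_free_space() punches out every other chunk, so a fresh block group starts out with the same checkerboard layout and matching accounting.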
@@ -10363,8 +10485,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
-        * Make sure counter is updated before we wake up
-        * waiters.
+        * Make sure counter is updated before we wake up waiters.
         */
        smp_mb();
        if (waitqueue_active(&root->subv_writers->wait))