Btrfs: remove empty block groups automatically
authorJosef Bacik <jbacik@fb.com>
Thu, 18 Sep 2014 15:20:02 +0000 (11:20 -0400)
committerChris Mason <clm@fb.com>
Tue, 23 Sep 2014 00:13:21 +0000 (17:13 -0700)
One problem that has plagued us is that a user will use up all of his space with
data, remove a bunch of that data, and then try to create a bunch of small files
and run out of space.  This happens because all the chunks were allocated for
data since the metadata requirements were so low.  But now there's a bunch of
empty data block groups and not enough metadata space to do anything.  This
patch solves this problem by automatically deleting empty block groups.  If we
notice the used count go down to 0 when deleting or on mount notice that a block
group has a used count of 0 then we will queue it to be deleted.

When the cleaner thread runs we will double check to make sure the block group
is still empty and then we will delete it.  This patch has the side effect of no
longer having a bunch of BUG_ON()'s in the chunk delete code, which will be
helpful for both this and relocate.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/tests/free-space-tests.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index 51ff3f8..089f6da 100644 (file)
@@ -1298,8 +1298,8 @@ struct btrfs_block_group_cache {
         */
        struct list_head cluster_list;
 
-       /* For delayed block group creation */
-       struct list_head new_bg_list;
+       /* For delayed block group creation or deletion of empty block groups */
+       struct list_head bg_list;
 };
 
 /* delayed seq elem */
@@ -1568,6 +1568,7 @@ struct btrfs_fs_info {
        int do_barriers;
        int closing;
        int log_root_recovering;
+       int open;
 
        u64 total_pinned;
 
@@ -1717,6 +1718,9 @@ struct btrfs_fs_info {
 
        /* Used to reclaim the metadata space in the background. */
        struct work_struct async_reclaim_work;
+
+       spinlock_t unused_bgs_lock;
+       struct list_head unused_bgs;
 };
 
 struct btrfs_subvolume_writers {
@@ -3344,6 +3348,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
index 48794f9..4780e66 100644 (file)
@@ -1769,6 +1769,7 @@ static int cleaner_kthread(void *arg)
                }
 
                btrfs_run_delayed_iputs(root);
+               btrfs_delete_unused_bgs(root->fs_info);
                again = btrfs_clean_one_deleted_snapshot(root);
                mutex_unlock(&root->fs_info->cleaner_mutex);
 
@@ -2230,6 +2231,7 @@ int open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->super_lock);
        spin_lock_init(&fs_info->qgroup_op_lock);
        spin_lock_init(&fs_info->buffer_lock);
+       spin_lock_init(&fs_info->unused_bgs_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
@@ -2239,6 +2241,7 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+       INIT_LIST_HEAD(&fs_info->unused_bgs);
        btrfs_mapping_init(&fs_info->mapping_tree);
        btrfs_init_block_rsv(&fs_info->global_block_rsv,
                             BTRFS_BLOCK_RSV_GLOBAL);
@@ -2977,6 +2980,8 @@ retry_root_backup:
                fs_info->update_uuid_tree_gen = 1;
        }
 
+       fs_info->open = 1;
+
        return 0;
 
 fail_qgroup:
@@ -3688,6 +3693,7 @@ void close_ctree(struct btrfs_root *root)
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        btrfs_stop_all_workers(fs_info);
 
+       fs_info->open = 0;
        free_root_pointers(fs_info, 1);
 
        iput(fs_info->btree_inode);
index b30ddb4..28a27d5 100644 (file)
@@ -5433,6 +5433,20 @@ static int update_block_group(struct btrfs_root *root,
                        spin_unlock(&cache->space_info->lock);
                } else {
                        old_val -= num_bytes;
+
+                       /*
+                        * No longer have used bytes in this block group, queue
+                        * it for deletion.
+                        */
+                       if (old_val == 0) {
+                               spin_lock(&info->unused_bgs_lock);
+                               if (list_empty(&cache->bg_list)) {
+                                       btrfs_get_block_group(cache);
+                                       list_add_tail(&cache->bg_list,
+                                                     &info->unused_bgs);
+                               }
+                               spin_unlock(&info->unused_bgs_lock);
+                       }
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->pinned += num_bytes;
                        cache->space_info->bytes_pinned += num_bytes;
@@ -8855,6 +8869,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        }
        up_write(&info->commit_root_sem);
 
+       spin_lock(&info->unused_bgs_lock);
+       while (!list_empty(&info->unused_bgs)) {
+               block_group = list_first_entry(&info->unused_bgs,
+                                              struct btrfs_block_group_cache,
+                                              bg_list);
+               list_del_init(&block_group->bg_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&info->unused_bgs_lock);
+
        spin_lock(&info->block_group_cache_lock);
        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
                block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8989,7 +9013,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        init_rwsem(&cache->data_rwsem);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
-       INIT_LIST_HEAD(&cache->new_bg_list);
+       INIT_LIST_HEAD(&cache->bg_list);
        btrfs_init_free_space_ctl(cache);
 
        return cache;
@@ -9130,8 +9154,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                __link_block_group(space_info, cache);
 
                set_avail_alloc_bits(root->fs_info, cache->flags);
-               if (btrfs_chunk_readonly(root, cache->key.objectid))
+               if (btrfs_chunk_readonly(root, cache->key.objectid)) {
                        set_block_group_ro(cache, 1);
+               } else if (btrfs_block_group_used(&cache->item) == 0) {
+                       spin_lock(&info->unused_bgs_lock);
+                       /* Should always be true but just in case. */
+                       if (list_empty(&cache->bg_list)) {
+                               btrfs_get_block_group(cache);
+                               list_add_tail(&cache->bg_list,
+                                             &info->unused_bgs);
+                       }
+                       spin_unlock(&info->unused_bgs_lock);
+               }
        }
 
        list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -9172,10 +9206,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        int ret = 0;
 
-       list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
-                                new_bg_list) {
-               list_del_init(&block_group->new_bg_list);
-
+       list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+               list_del_init(&block_group->bg_list);
                if (ret)
                        continue;
 
@@ -9261,7 +9293,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        __link_block_group(cache->space_info, cache);
 
-       list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+       list_add_tail(&cache->bg_list, &trans->new_bgs);
 
        set_avail_alloc_bits(extent_root->fs_info, type);
 
@@ -9430,6 +9462,101 @@ out:
        return ret;
 }
 
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_space_info *space_info;
+       struct btrfs_root *root = fs_info->extent_root;
+       struct btrfs_trans_handle *trans;
+       int ret = 0;
+
+       if (!fs_info->open)
+               return;
+
+       spin_lock(&fs_info->unused_bgs_lock);
+       while (!list_empty(&fs_info->unused_bgs)) {
+               u64 start, end;
+
+               block_group = list_first_entry(&fs_info->unused_bgs,
+                                              struct btrfs_block_group_cache,
+                                              bg_list);
+               space_info = block_group->space_info;
+               list_del_init(&block_group->bg_list);
+               if (ret || btrfs_mixed_space_info(space_info)) {
+                       btrfs_put_block_group(block_group);
+                       continue;
+               }
+               spin_unlock(&fs_info->unused_bgs_lock);
+
+               /* Don't want to race with allocators so take the groups_sem */
+               down_write(&space_info->groups_sem);
+               spin_lock(&block_group->lock);
+               if (block_group->reserved ||
+                   btrfs_block_group_used(&block_group->item) ||
+                   block_group->ro) {
+                       /*
+                        * We want to bail if we made new allocations or have
+                        * outstanding allocations in this block group.  We do
+                        * the ro check in case balance is currently acting on
+                        * this block group.
+                        */
+                       spin_unlock(&block_group->lock);
+                       up_write(&space_info->groups_sem);
+                       goto next;
+               }
+               spin_unlock(&block_group->lock);
+
+               /* We don't want to force the issue, only flip if it's ok. */
+               ret = set_block_group_ro(block_group, 0);
+               up_write(&space_info->groups_sem);
+               if (ret < 0) {
+                       ret = 0;
+                       goto next;
+               }
+
+               /*
+                * Want to do this before we do anything else so we can recover
+                * properly if we fail to join the transaction.
+                */
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       btrfs_set_block_group_rw(root, block_group);
+                       ret = PTR_ERR(trans);
+                       goto next;
+               }
+
+               /*
+                * We could have pending pinned extents for this block group,
+                * just delete them, we don't care about them anymore.
+                */
+               start = block_group->key.objectid;
+               end = start + block_group->key.offset - 1;
+               clear_extent_bits(&fs_info->freed_extents[0], start, end,
+                                 EXTENT_DIRTY, GFP_NOFS);
+               clear_extent_bits(&fs_info->freed_extents[1], start, end,
+                                 EXTENT_DIRTY, GFP_NOFS);
+
+               /* Reset pinned so btrfs_put_block_group doesn't complain */
+               block_group->pinned = 0;
+
+               /*
+                * Btrfs_remove_chunk will abort the transaction if things go
+                * horribly wrong.
+                */
+               ret = btrfs_remove_chunk(trans, root,
+                                        block_group->key.objectid);
+               btrfs_end_transaction(trans, root);
+next:
+               btrfs_put_block_group(block_group);
+               spin_lock(&fs_info->unused_bgs_lock);
+       }
+       spin_unlock(&fs_info->unused_bgs_lock);
+}
+
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_space_info *space_info;
index d78ae10..2299bfd 100644 (file)
@@ -45,7 +45,7 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
        spin_lock_init(&cache->lock);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
-       INIT_LIST_HEAD(&cache->new_bg_list);
+       INIT_LIST_HEAD(&cache->bg_list);
 
        btrfs_init_free_space_ctl(cache);
 
index 63e6327..f27c0f7 100644 (file)
@@ -2568,58 +2568,49 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
        return ret;
 }
 
-static int btrfs_relocate_chunk(struct btrfs_root *root,
-                        u64 chunk_tree, u64 chunk_objectid,
-                        u64 chunk_offset)
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, u64 chunk_offset)
 {
        struct extent_map_tree *em_tree;
-       struct btrfs_root *extent_root;
-       struct btrfs_trans_handle *trans;
-       struct btrfs_device *device;
        struct extent_map *em;
+       struct btrfs_root *extent_root = root->fs_info->extent_root;
        struct map_lookup *map;
        u64 dev_extent_len = 0;
-       int ret;
-       int i;
+       u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+       u64 chunk_tree = root->fs_info->chunk_root->objectid;
+       int i, ret = 0;
 
+       /* Just in case */
        root = root->fs_info->chunk_root;
-       extent_root = root->fs_info->extent_root;
        em_tree = &root->fs_info->mapping_tree.map_tree;
 
-       ret = btrfs_can_relocate(extent_root, chunk_offset);
-       if (ret)
-               return -ENOSPC;
-
-       /* step one, relocate all the extents inside this chunk */
-       ret = btrfs_relocate_block_group(extent_root, chunk_offset);
-       if (ret)
-               return ret;
-
-       trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans)) {
-               ret = PTR_ERR(trans);
-               btrfs_std_error(root->fs_info, ret);
-               return ret;
-       }
-
-       /*
-        * step two, delete the device extents and the
-        * chunk tree entries
-        */
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
        read_unlock(&em_tree->lock);
 
-       BUG_ON(!em || em->start > chunk_offset ||
-              em->start + em->len < chunk_offset);
+       if (!em || em->start > chunk_offset ||
+           em->start + em->len < chunk_offset) {
+               /*
+                * This is a logic error, but we don't want to just rely on the
+                * user having built with ASSERT enabled, so if ASSERT doens't
+                * do anything we still error out.
+                */
+               ASSERT(0);
+               if (em)
+                       free_extent_map(em);
+               return -EINVAL;
+       }
        map = (struct map_lookup *)em->bdev;
 
        for (i = 0; i < map->num_stripes; i++) {
-               device = map->stripes[i].dev;
+               struct btrfs_device *device = map->stripes[i].dev;
                ret = btrfs_free_dev_extent(trans, device,
                                            map->stripes[i].physical,
                                            &dev_extent_len);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
 
                if (device->bytes_used > 0) {
                        lock_chunks(root);
@@ -2634,23 +2625,34 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 
                if (map->stripes[i].dev) {
                        ret = btrfs_update_device(trans, map->stripes[i].dev);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, root, ret);
+                               goto out;
+                       }
                }
        }
        ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
                               chunk_offset);
-
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
 
        trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
 
        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
        }
 
        ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, extent_root, ret);
+               goto out;
+       }
 
        write_lock(&em_tree->lock);
        remove_extent_mapping(em_tree, em);
@@ -2658,11 +2660,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 
        /* once for the tree */
        free_extent_map(em);
+out:
        /* once for us */
        free_extent_map(em);
+       return ret;
+}
+
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+                        u64 chunk_tree, u64 chunk_objectid,
+                        u64 chunk_offset)
+{
+       struct btrfs_root *extent_root;
+       struct btrfs_trans_handle *trans;
+       int ret;
+
+       root = root->fs_info->chunk_root;
+       extent_root = root->fs_info->extent_root;
+
+       ret = btrfs_can_relocate(extent_root, chunk_offset);
+       if (ret)
+               return -ENOSPC;
+
+       /* step one, relocate all the extents inside this chunk */
+       ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+       if (ret)
+               return ret;
 
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               btrfs_std_error(root->fs_info, ret);
+               return ret;
+       }
+
+       /*
+        * step two, delete the device extents and the
+        * chunk tree entries
+        */
+       ret = btrfs_remove_chunk(trans, root, chunk_offset);
        btrfs_end_transaction(trans, root);
-       return 0;
+       return ret;
 }
 
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
index 91998bc..08980fa 100644 (file)
@@ -463,6 +463,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                                struct btrfs_root *extent_root,
                                u64 chunk_offset, u64 chunk_size);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, u64 chunk_offset);
 
 static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
 {