Btrfs: keep dropped roots in cache until transaction commit
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1c2bd17..9f96042 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1316,8 +1316,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
-                                         struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
 {
        struct btrfs_key key;
@@ -1883,10 +1882,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static int btrfs_issue_discard(struct block_device *bdev,
-                               u64 start, u64 len)
+#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+                              u64 *discarded_bytes)
 {
-       return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+       int j, ret = 0;
+       u64 bytes_left, end;
+       u64 aligned_start = ALIGN(start, 1 << 9);
+
+       if (WARN_ON(start != aligned_start)) {
+               len -= aligned_start - start;
+               len = round_down(len, 1 << 9);
+               start = aligned_start;
+       }
+
+       *discarded_bytes = 0;
+
+       if (!len)
+               return 0;
+
+       end = start + len;
+       bytes_left = len;
+
+       /* Skip any superblocks on this device. */
+       for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+               u64 sb_start = btrfs_sb_offset(j);
+               u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+               u64 size = sb_start - start;
+
+               if (!in_range(sb_start, start, bytes_left) &&
+                   !in_range(sb_end, start, bytes_left) &&
+                   !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+                       continue;
+
+               /*
+                * Superblock spans beginning of range.  Adjust start and
+                * try again.
+                */
+               if (sb_start <= start) {
+                       start = sb_end;
+                       if (start > end) {
+                               bytes_left = 0;
+                               break;
+                       }
+                       bytes_left = end - start;
+                       continue;
+               }
+
+               if (size) {
+                       ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+                                                  GFP_NOFS, 0);
+                       if (!ret)
+                               *discarded_bytes += size;
+                       else if (ret != -EOPNOTSUPP)
+                               return ret;
+               }
+
+               start = sb_end;
+               if (start > end) {
+                       bytes_left = 0;
+                       break;
+               }
+               bytes_left = end - start;
+       }
+
+       if (bytes_left) {
+               ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+                                          GFP_NOFS, 0);
+               if (!ret)
+                       *discarded_bytes += bytes_left;
+       }
+       return ret;
 }
 
 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
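The hunk above makes btrfs_issue_discard() carve the requested range
around every superblock mirror before handing the pieces to
blkdev_issue_discard(). As a rough standalone sketch of that splitting
idea (hypothetical helper names, one reserved region instead of the
BTRFS_SUPER_MIRROR_MAX loop):

    #include <stdint.h>
    #include <stdio.h>

    static void do_discard(uint64_t start, uint64_t len)
    {
            printf("discard [%llu, %llu)\n",
                   (unsigned long long)start,
                   (unsigned long long)(start + len));
    }

    /* Discard [start, start + len) while skipping [rsv_start, rsv_end). */
    static void discard_around(uint64_t start, uint64_t len,
                               uint64_t rsv_start, uint64_t rsv_end)
    {
            uint64_t end = start + len;

            if (rsv_end <= start || rsv_start >= end) {
                    do_discard(start, len);               /* no overlap */
                    return;
            }
            if (rsv_start > start)                        /* piece before */
                    do_discard(start, rsv_start - start);
            if (rsv_end < end)                            /* piece after */
                    do_discard(rsv_end, end - rsv_end);
    }

    int main(void)
    {
            /* Reserved region in the middle: two discard calls. */
            discard_around(0, 4096, 1024, 2048);
            return 0;
    }

The kernel version additionally accumulates *discarded_bytes and treats
-EOPNOTSUPP as non-fatal, so one device without discard support does
not abort the whole operation.
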
@@ -1907,14 +1973,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+                       u64 bytes;
                        if (!stripe->dev->can_discard)
                                continue;
 
                        ret = btrfs_issue_discard(stripe->dev->bdev,
                                                  stripe->physical,
-                                                 stripe->length);
+                                                 stripe->length,
+                                                 &bytes);
                        if (!ret)
-                               discarded_bytes += stripe->length;
+                               discarded_bytes += bytes;
                        else if (ret != -EOPNOTSUPP)
                                break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
@@ -2296,9 +2364,22 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 static inline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
+       struct btrfs_delayed_ref_node *ref;
+
        if (list_empty(&head->ref_list))
                return NULL;
 
+       /*
+        * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+        * This is to prevent a ref count from going down to zero, which deletes
+        * the extent item from the extent tree, when there still are references
+        * to add, which would fail because they would not find the extent item.
+        */
+       list_for_each_entry(ref, &head->ref_list, list) {
+               if (ref->action == BTRFS_ADD_DELAYED_REF)
+                       return ref;
+       }
+
        return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
                          list);
 }
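
The comment above is the key invariant. A toy replay (illustrative
only, not kernel types) of why plain FIFO order breaks when a drop
precedes an add on the same head:

    #include <stdio.h>

    enum action { DROP_REF, ADD_REF };

    int main(void)
    {
            /* Extent starts with one reference; two refs are queued. */
            enum action pending[] = { DROP_REF, ADD_REF };
            int refs = 1;
            int i;

            for (i = 0; i < 2; i++) {
                    refs += (pending[i] == ADD_REF) ? 1 : -1;
                    if (refs == 0)
                            printf("refs hit zero: extent item deleted, "
                                   "the queued ADD will not find it\n");
            }
            return 0;
    }

Running the ADD first instead gives 1 -> 2 -> 1, so the extent item
survives the whole sequence, which is exactly what the
list_for_each_entry() scan above guarantees.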
@@ -3661,10 +3742,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_reserved = 0;
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
-       if (total_bytes > 0)
-               found->full = 0;
-       else
-               found->full = 1;
+       found->full = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@ -4214,6 +4292,24 @@ out:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
        mutex_unlock(&fs_info->chunk_mutex);
+       /*
+        * When we allocate a new chunk we reserve space in the chunk block
+        * reserve to make sure we can COW nodes/leaves in the chunk tree or
+        * add new nodes/leaves to it if we end up needing to do it when
+        * inserting the chunk item and updating device items as part of the
+        * second phase of chunk allocation, performed by
+        * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+        * large number of new block groups to create in our transaction
+        * handle's new_bgs list to avoid exhausting the chunk block reserve
+        * in extreme cases - like having a single transaction create many new
+        * block groups when starting to write out the free space caches of all
+        * the block groups that were made dirty during the lifetime of the
+        * transaction.
+        */
+       if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+               btrfs_create_pending_block_groups(trans, trans->root);
+               btrfs_trans_release_chunk_metadata(trans);
+       }
        return ret;
 }
 
@@ -6031,20 +6127,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_group_cache *block_group, *tmp;
+       struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
        u64 start;
        u64 end;
        int ret;
 
-       if (trans->aborted)
-               return 0;
-
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
                unpin = &fs_info->freed_extents[1];
        else
                unpin = &fs_info->freed_extents[0];
 
-       while (1) {
+       while (!trans->aborted) {
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY, NULL);
@@ -6063,6 +6158,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                cond_resched();
        }
 
+       /*
+        * Transaction is finished.  We don't need the lock anymore.  We
+        * do need to clean up the block groups in case of a transaction
+        * abort.
+        */
+       deleted_bgs = &trans->transaction->deleted_bgs;
+       list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+               u64 trimmed = 0;
+
+               ret = -EROFS;
+               if (!trans->aborted)
+                       ret = btrfs_discard_extent(root,
+                                                  block_group->key.objectid,
+                                                  block_group->key.offset,
+                                                  &trimmed);
+
+               list_del_init(&block_group->bg_list);
+               btrfs_put_block_group_trimming(block_group);
+               btrfs_put_block_group(block_group);
+
+               if (ret) {
+                       const char *errstr = btrfs_decode_error(ret);
+                       btrfs_warn(fs_info,
+                                  "discard failed while removing block group: errno=%d %s",
+                                  ret, errstr);
+               }
+       }
+
        return 0;
 }
 
@@ -6318,7 +6441,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        } else {
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
-                              extent_data_ref_count(root, path, iref));
+                              extent_data_ref_count(path, iref));
                        if (iref) {
                                BUG_ON(path->slots[0] != extent_slot);
                        } else {
@@ -7536,9 +7659,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 
 /*
  * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
  * returns the tree buffer or an ERR_PTR on error.
  */
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -8545,7 +8665,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        }
 
        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
-               btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
+               btrfs_add_dropped_root(trans, root);
        } else {
                free_extent_buffer(root->node);
                free_extent_buffer(root->commit_root);
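
This is the hunk that gives the patch its subject: instead of tearing
the root down immediately in btrfs_drop_snapshot(), it is handed to
btrfs_add_dropped_root() so it stays cached until the transaction
commits. From the call site, the helper's job (it lives in
fs/btrfs/transaction.c; the sketch below is paraphrased from memory,
not the authoritative version) is roughly:

    /* Sketch, from memory: queue the dropped root on the current
     * transaction so it is only freed once that transaction commits. */
    void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
    {
            struct btrfs_transaction *cur_trans = trans->transaction;

            spin_lock(&cur_trans->dropped_roots_lock);
            list_add_tail(&root->root_list, &cur_trans->dropped_roots);
            spin_unlock(&cur_trans->dropped_roots_lock);
    }

The commit path then walks dropped_roots and calls
btrfs_drop_and_free_fs_root() there instead, which is the call this
hunk removes from the drop path.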
@@ -8692,14 +8812,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        return flags;
 }
 
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
        u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
-
        /*
         * We need some metadata space and system metadata space for
         * allocating chunks in some corner cases until we force to set
@@ -8716,6 +8835,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
        spin_lock(&cache->lock);
 
        if (cache->ro) {
+               cache->ro++;
                ret = 0;
                goto out;
        }
@@ -8727,7 +8847,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
            min_allocable_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
-               cache->ro = 1;
+               cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
                ret = 0;
        }
@@ -8737,7 +8857,7 @@ out:
        return ret;
 }
 
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache)
 
 {
@@ -8745,8 +8865,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        u64 alloc_flags;
        int ret;
 
-       BUG_ON(cache->ro);
-
 again:
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
@@ -8789,7 +8907,7 @@ again:
                        goto out;
        }
 
-       ret = set_block_group_ro(cache, 0);
+       ret = inc_block_group_ro(cache, 0);
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8797,7 +8915,7 @@ again:
                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
-       ret = set_block_group_ro(cache, 0);
+       ret = inc_block_group_ro(cache, 0);
 out:
        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
                alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8860,7 +8978,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
        return free_bytes;
 }
 
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
@@ -8870,11 +8988,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
 
        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);
-       num_bytes = cache->key.offset - cache->reserved - cache->pinned -
-                   cache->bytes_super - btrfs_block_group_used(&cache->item);
-       sinfo->bytes_readonly -= num_bytes;
-       cache->ro = 0;
-       list_del_init(&cache->ro_list);
+       if (!--cache->ro) {
+               num_bytes = cache->key.offset - cache->reserved -
+                           cache->pinned - cache->bytes_super -
+                           btrfs_block_group_used(&cache->item);
+               sinfo->bytes_readonly -= num_bytes;
+               list_del_init(&cache->ro_list);
+       }
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
 }
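
The inc/dec naming change matters: cache->ro is now a counter rather
than a flag, so two independent users (balance relocating a chunk and
scrub, for instance) can hold the same block group read-only without
stepping on each other. A minimal model of the new semantics
(hypothetical struct, not the kernel one):

    #include <assert.h>

    struct bg { int ro; };

    static void inc_ro(struct bg *bg)
    {
            bg->ro++;               /* nestable: every holder counts */
    }

    static void dec_ro(struct bg *bg)
    {
            /* Only the last holder returns the group to read-write
             * (the kernel version also gives bytes_readonly back to
             * the space_info at that point). */
            --bg->ro;
    }

    int main(void)
    {
            struct bg bg = { 0 };

            inc_ro(&bg);            /* e.g. balance */
            inc_ro(&bg);            /* e.g. scrub */
            dec_ro(&bg);
            assert(bg.ro == 1);     /* still read-only */
            dec_ro(&bg);
            assert(bg.ro == 0);     /* read-write again */
            return 0;
    }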
@@ -9390,7 +9510,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
                set_avail_alloc_bits(root->fs_info, cache->flags);
                if (btrfs_chunk_readonly(root, cache->key.objectid)) {
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
                } else if (btrfs_block_group_used(&cache->item) == 0) {
                        spin_lock(&info->unused_bgs_lock);
                        /* Should always be true but just in case. */
@@ -9418,11 +9538,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_RAID0],
                                list)
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_SINGLE],
                                list)
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
        }
 
        init_global_block_rsv(info);
@@ -9803,6 +9923,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
+        *
+        * There may also be an implicit trim operation if the file system
+        * is mounted with -odiscard. The same protections must remain
+        * in place until the extents have been discarded completely when
+        * the transaction commit has completed.
         */
        remove_em = (atomic_read(&block_group->trimming) == 0);
        /*
@@ -9877,6 +10002,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                u64 start, end;
+               int trimming;
 
                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group_cache,
@@ -9910,7 +10036,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
 
                /* We don't want to force the issue, only flip if it's ok. */
-               ret = set_block_group_ro(block_group, 0);
+               ret = inc_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        ret = 0;
@@ -9924,7 +10050,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                /* 1 for btrfs_orphan_reserve_metadata() */
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }
@@ -9951,14 +10077,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -9976,12 +10102,39 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);
 
+               /* DISCARD can flip during remount */
+               trimming = btrfs_test_opt(root, DISCARD);
+
+               /* Implicit trim during transaction commit. */
+               if (trimming)
+                       btrfs_get_block_group_trimming(block_group);
+
                /*
                 * Btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.
                 */
                ret = btrfs_remove_chunk(trans, root,
                                         block_group->key.objectid);
+
+               if (ret) {
+                       if (trimming)
+                               btrfs_put_block_group_trimming(block_group);
+                       goto end_trans;
+               }
+
+               /*
+                * If we're not mounted with -odiscard, we can just forget
+                * about this block group. Otherwise we'll need to wait
+                * until transaction commit to do the actual discard.
+                */
+               if (trimming) {
+                       WARN_ON(!list_empty(&block_group->bg_list));
+                       spin_lock(&trans->transaction->deleted_bgs_lock);
+                       list_move(&block_group->bg_list,
+                                 &trans->transaction->deleted_bgs);
+                       spin_unlock(&trans->transaction->deleted_bgs_lock);
+                       btrfs_get_block_group(block_group);
+               }
 end_trans:
                btrfs_end_transaction(trans, root);
 next:
@@ -10035,10 +10188,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
        return unpin_extent_range(root, start, end, false);
 }
 
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space.  Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time.  We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses.  For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+                                  u64 minlen, u64 *trimmed)
+{
+       u64 start = 0, len = 0;
+       int ret;
+
+       *trimmed = 0;
+
+       /* Not writeable = nothing to do. */
+       if (!device->writeable)
+               return 0;
+
+       /* No free space = nothing to do. */
+       if (device->total_bytes <= device->bytes_used)
+               return 0;
+
+       ret = 0;
+
+       while (1) {
+               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+               struct btrfs_transaction *trans;
+               u64 bytes;
+
+               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+               if (ret)
+                       return ret;
+
+               down_read(&fs_info->commit_root_sem);
+
+               spin_lock(&fs_info->trans_lock);
+               trans = fs_info->running_transaction;
+               if (trans)
+                       atomic_inc(&trans->use_count);
+               spin_unlock(&fs_info->trans_lock);
+
+               ret = find_free_dev_extent_start(trans, device, minlen, start,
+                                                &start, &len);
+               if (trans)
+                       btrfs_put_transaction(trans);
+
+               if (ret) {
+                       up_read(&fs_info->commit_root_sem);
+                       mutex_unlock(&fs_info->chunk_mutex);
+                       if (ret == -ENOSPC)
+                               ret = 0;
+                       break;
+               }
+
+               ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+               up_read(&fs_info->commit_root_sem);
+               mutex_unlock(&fs_info->chunk_mutex);
+
+               if (ret)
+                       break;
+
+               start += len;
+               *trimmed += bytes;
+
+               if (fatal_signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+
+               cond_resched();
+       }
+
+       return ret;
+}
+
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_device *device;
+       struct list_head *devices;
        u64 group_trimmed;
        u64 start;
        u64 end;
@@ -10093,6 +10335,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
                cache = next_block_group(fs_info->tree_root, cache);
        }
 
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       devices = &root->fs_info->fs_devices->alloc_list;
+       list_for_each_entry(device, devices, dev_alloc_list) {
+               ret = btrfs_trim_free_extents(device, range->minlen,
+                                             &group_trimmed);
+               if (ret)
+                       break;
+
+               trimmed += group_trimmed;
+       }
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
        range->len = trimmed;
        return ret;
 }
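
With the per-device loop above, btrfs_trim_fs() now trims unallocated
device space in addition to free space inside block groups. The whole
path is driven from userspace through the generic FITRIM ioctl; a
minimal caller (standard Linux UAPI, nothing btrfs-specific) looks
like this:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* FITRIM, struct fstrim_range */

    int main(int argc, char **argv)
    {
            struct fstrim_range range;
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_RDONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            memset(&range, 0, sizeof(range));
            range.len = UINT64_MAX;  /* whole filesystem */
            range.minlen = 0;        /* becomes btrfs_trim_fs()'s minlen */
            if (ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");
                    return 1;
            }
            /* The kernel writes the trimmed byte count back into len,
             * mirroring range->len = trimmed above. */
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            return 0;
    }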