Btrfs: fix regression running delayed references when using qgroups
[cascardo/linux.git] / fs / btrfs / inode.c
index c21ed7b..6f030c2 100644 (file)
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+       /*
+        * Don't forget to free the reserved space, as for inlined extent
+        * it won't count as data extent, free them directly here.
+        * And at reserve time, it's always aligned to page size, so
+        * just free one page here.
+        */
+       btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
@@ -1769,7 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
-                       btrfs_free_reserved_data_space(inode, len);
+                       btrfs_free_reserved_data_space_noquota(inode,
+                                       state->start, len);
 
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
                                     root->fs_info->delalloc_batch);
@@ -1992,7 +2000,8 @@ again:
                goto again;
        }
 
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
@@ -2586,7 +2595,7 @@ again:
        ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
                        new->disk_len, 0,
                        backref->root_id, backref->inum,
-                       new->file_pos, 0);      /* start - extent_offset */
+                       new->file_pos); /* start - extent_offset */
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out_free_path;
@@ -2836,6 +2845,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+               /*
+                * For mwrite(mmap + memset to write) case, we still reserve
+                * space for NOCOW range.
+                * As NOCOW won't cause a new delayed ref, just free the space
+                */
+               btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+                                      ordered_extent->len);
                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (nolock)
                        trans = btrfs_join_transaction_nolock(root);
@@ -4524,7 +4541,7 @@ delete:
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
-                                               ino, extent_offset, 0);
+                                               ino, extent_offset);
                        BUG_ON(ret);
                        if (btrfs_should_throttle_delayed_refs(trans, root))
                                btrfs_async_run_delayed_refs(root,
@@ -4638,14 +4655,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
        if ((offset & (blocksize - 1)) == 0 &&
            (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode,
+                       round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
        if (ret)
                goto out;
 
 again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode,
+                               round_down(from, PAGE_CACHE_SIZE),
+                               PAGE_CACHE_SIZE);
                ret = -ENOMEM;
                goto out;
        }
@@ -4713,7 +4733,8 @@ again:
 
 out_unlock:
        if (ret)
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode, page_start,
+                                            PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
 out:
@@ -5111,6 +5132,18 @@ static void evict_inode_truncate_pages(struct inode *inode)
                spin_unlock(&io_tree->lock);
 
                lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+               /*
+                * If still has DELALLOC flag, the extent didn't reach disk,
+                * and its reserved space won't be freed by delayed_ref.
+                * So we need to free its reserved space here.
+                * (Refer to comment in btrfs_invalidatepage, case 2)
+                *
+                * Note, end is the bytenr of last byte, so we need + 1 here.
+                */
+               if (state->state & EXTENT_DELALLOC)
+                       btrfs_qgroup_free_data(inode, start, end - start + 1);
+
                clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -7644,7 +7677,7 @@ unlock:
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
-               btrfs_free_reserved_data_space(inode, len);
+               btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
                current->journal_info = dio_data;
@@ -8434,7 +8467,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                        mutex_unlock(&inode->i_mutex);
                        relock = true;
                }
-               ret = btrfs_delalloc_reserve_space(inode, count);
+               ret = btrfs_delalloc_reserve_space(inode, offset, count);
                if (ret)
                        goto out;
                dio_data.outstanding_extents = div64_u64(count +
@@ -8463,10 +8496,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED) {
                        if (dio_data.reserve)
-                               btrfs_delalloc_release_space(inode,
-                                                       dio_data.reserve);
+                               btrfs_delalloc_release_space(inode, offset,
+                                                            dio_data.reserve);
                } else if (ret >= 0 && (size_t)ret < count)
-                       btrfs_delalloc_release_space(inode,
+                       btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
        }
 out:
@@ -8625,6 +8658,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                }
        }
 
+       /*
+        * Qgroup reserved space handler
+        * Page here will be either
+        * 1) Already written to disk
+        *    In this case, its reserved space is released from data rsv map
+        *    and will be freed by delayed_ref handler finally.
+        *    So even we call qgroup_free_data(), it won't decrease reserved
+        *    space.
+        * 2) Not written to disk
+        *    This means the reserved space should be freed here.
+        */
+       btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8675,7 +8720,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_end;
 
        sb_start_pagefault(inode->i_sb);
-       ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       page_start = page_offset(page);
+       page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (!ret) {
                ret = file_update_time(vma->vm_file);
                reserved = 1;
@@ -8694,8 +8743,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 again:
        lock_page(page);
        size = i_size_read(inode);
-       page_start = page_offset(page);
-       page_end = page_start + PAGE_CACHE_SIZE - 1;
 
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
@@ -8772,7 +8819,7 @@ out_unlock:
        }
        unlock_page(page);
 out:
-       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+       btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
 out_noreserve:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@ -9061,6 +9108,7 @@ void btrfs_destroy_inode(struct inode *inode)
                        btrfs_put_ordered_extent(ordered);
                }
        }
+       btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
@@ -9697,6 +9745,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        u64 cur_offset = start;
        u64 i_size;
        u64 cur_bytes;
+       u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
 
@@ -9713,6 +9762,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 
                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
                cur_bytes = max(cur_bytes, min_size);
+               /*
+                * If we are severely fragmented we could end up with really
+                * small allocations, so if the allocator is returning small
+                * chunks lets make its job easier by only searching for those
+                * sized chunks.
+                */
+               cur_bytes = min(cur_bytes, last_alloc);
                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
                                           *alloc_hint, &ins, 1, 0);
                if (ret) {
@@ -9721,6 +9777,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                        break;
                }
 
+               last_alloc = ins.offset;
                ret = insert_reserved_file_extent(trans, inode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,