btrfs: fallocate: Add support to accurate qgroup reserve
[cascardo/linux.git] / fs / btrfs / file.c
index b823fac..1243205 100644 (file)
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        u64 release_bytes = 0;
        u64 lockstart;
        u64 lockend;
-       unsigned long first_index;
        size_t num_written = 0;
        int nrptrs;
        int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        if (!pages)
                return -ENOMEM;
 
-       first_index = pos >> PAGE_CACHE_SHIFT;
-
        while (iov_iter_count(i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
                size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
 
                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-               ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-               if (ret == -ENOSPC &&
-                   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-                                             BTRFS_INODE_PREALLOC))) {
+
+               if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+                                            BTRFS_INODE_PREALLOC)) {
                        ret = check_can_nocow(inode, pos, &write_bytes);
+                       if (ret < 0)
+                               break;
                        if (ret > 0) {
+                               /*
+                                * For the nodatacow case, there is no need
+                                * to reserve data space.
+                                */
                                only_release_metadata = true;
                                /*
                                 * our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                num_pages = DIV_ROUND_UP(write_bytes + offset,
                                                         PAGE_CACHE_SIZE);
                                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-                               ret = 0;
-                       } else {
-                               ret = -ENOSPC;
+                               goto reserve_metadata;
                        }
                }
-
-               if (ret)
+               ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+               if (ret < 0)
                        break;
 
+reserve_metadata:
                ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
                if (ret) {
                        if (!only_release_metadata)
-                               btrfs_free_reserved_data_space(inode,
-                                                              reserve_bytes);
+                               btrfs_free_reserved_data_space(inode, pos,
+                                                              write_bytes);
                        else
                                btrfs_end_write_no_snapshoting(root);
                        break;
@@ -1607,7 +1608,7 @@ again:
                                btrfs_delalloc_release_metadata(inode,
                                                                release_bytes);
                        else
-                               btrfs_delalloc_release_space(inode,
+                               btrfs_delalloc_release_space(inode, pos,
                                                             release_bytes);
                }
 
@@ -1660,7 +1661,7 @@ again:
                        btrfs_end_write_no_snapshoting(root);
                        btrfs_delalloc_release_metadata(inode, release_bytes);
                } else {
-                       btrfs_delalloc_release_space(inode, release_bytes);
+                       btrfs_delalloc_release_space(inode, pos, release_bytes);
                }
        }
 
@@ -2266,7 +2267,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        u64 drop_end;
        int ret = 0;
        int err = 0;
-       int rsv_count;
+       unsigned int rsv_count;
        bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        u64 ino_size;
@@ -2541,17 +2542,61 @@ out_only_mutex:
        return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+       struct list_head list;
+       u64 start;
+       u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * The caller should have locked a larger extent range containing
+ * [start, start + len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+       struct falloc_range *prev = NULL;
+       struct falloc_range *range = NULL;
+
+       if (list_empty(head))
+               goto insert;
+
+       /*
+        * As fallocate iterates in bytenr order, we only need to check
+        * the last range.
+        */
+       prev = list_entry(head->prev, struct falloc_range, list);
+       if (prev->start + prev->len == start) {
+               prev->len += len;
+               return 0;
+       }
+insert:
+       range = kmalloc(sizeof(*range), GFP_NOFS);
+       if (!range)
+               return -ENOMEM;
+       range->start = start;
+       range->len = len;
+       list_add_tail(&range->list, head);
+       return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(file);
        struct extent_state *cached_state = NULL;
+       struct falloc_range *range;
+       struct falloc_range *tmp;
+       struct list_head reserve_list;
        u64 cur_offset;
        u64 last_byte;
        u64 alloc_start;
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
+       u64 actual_end = 0;
        struct extent_map *em;
        int blocksize = BTRFS_I(inode)->root->sectorsize;
        int ret;
@@ -2567,11 +2612,12 @@ static long btrfs_fallocate(struct file *file, int mode,
                return btrfs_punch_hole(inode, offset, len);
 
        /*
-        * Make sure we have enough space before we do the
-        * allocation.
+        * Only trigger disk allocation, don't trigger qgroup reserve
+        *
+        * For qgroup space, it will be checked later.
         */
-       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
-       if (ret)
+       ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+       if (ret < 0)
                return ret;
 
        mutex_lock(&inode->i_mutex);
@@ -2579,6 +2625,13 @@ static long btrfs_fallocate(struct file *file, int mode,
        if (ret)
                goto out;
 
+       /*
+        * TODO: Move these two operations after we have checked the
+        * accurate reserved space, or fallocate can still fail with
+        * the page truncated or the size expanded.
+        *
+        * But that's a minor problem and won't do much harm anyway.
+        */
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, i_size_read(inode),
                                        alloc_start);
@@ -2637,10 +2690,10 @@ static long btrfs_fallocate(struct file *file, int mode,
                }
        }
 
+       /* First, check if we exceed the qgroup limit */
+       INIT_LIST_HEAD(&reserve_list);
        cur_offset = alloc_start;
        while (1) {
-               u64 actual_end;
-
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
                if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2706,82 @@ static long btrfs_fallocate(struct file *file, int mode,
                last_byte = min(extent_map_end(em), alloc_end);
                actual_end = min_t(u64, extent_map_end(em), offset + len);
                last_byte = ALIGN(last_byte, blocksize);
-
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                       ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-                                                       last_byte - cur_offset,
-                                                       1 << inode->i_blkbits,
-                                                       offset + len,
-                                                       &alloc_hint);
-               } else if (actual_end > inode->i_size &&
-                          !(mode & FALLOC_FL_KEEP_SIZE)) {
-                       struct btrfs_trans_handle *trans;
-                       struct btrfs_root *root = BTRFS_I(inode)->root;
-
-                       /*
-                        * We didn't need to allocate any more space, but we
-                        * still extended the size of the file so we need to
-                        * update i_size and the inode item.
-                        */
-                       trans = btrfs_start_transaction(root, 1);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                       } else {
-                               inode->i_ctime = CURRENT_TIME;
-                               i_size_write(inode, actual_end);
-                               btrfs_ordered_update_i_size(inode, actual_end,
-                                                           NULL);
-                               ret = btrfs_update_inode(trans, root, inode);
-                               if (ret)
-                                       btrfs_end_transaction(trans, root);
-                               else
-                                       ret = btrfs_end_transaction(trans,
-                                                                   root);
+                       ret = add_falloc_range(&reserve_list, cur_offset,
+                                              last_byte - cur_offset);
+                       if (ret < 0) {
+                               free_extent_map(em);
+                               break;
                        }
+                       ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+                                       last_byte - cur_offset);
+                       if (ret < 0)
+                               break;
                }
                free_extent_map(em);
-               if (ret < 0)
-                       break;
-
                cur_offset = last_byte;
-               if (cur_offset >= alloc_end) {
-                       ret = 0;
+               if (cur_offset >= alloc_end)
                        break;
+       }
+
+       /*
+        * If ret is still 0, it means we are OK to fallocate.
+        * Otherwise, just clean up the list and exit.
+        */
+       list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+               if (!ret)
+                       ret = btrfs_prealloc_file_range(inode, mode,
+                                       range->start,
+                                       range->len, 1 << inode->i_blkbits,
+                                       offset + len, &alloc_hint);
+               list_del(&range->list);
+               kfree(range);
+       }
+       if (ret < 0)
+               goto out_unlock;
+
+       if (actual_end > inode->i_size &&
+           !(mode & FALLOC_FL_KEEP_SIZE)) {
+               struct btrfs_trans_handle *trans;
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+
+               /*
+                * We didn't need to allocate any more space, but we
+                * still extended the size of the file so we need to
+                * update i_size and the inode item.
+                */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+               } else {
+                       inode->i_ctime = CURRENT_TIME;
+                       i_size_write(inode, actual_end);
+                       btrfs_ordered_update_i_size(inode, actual_end, NULL);
+                       ret = btrfs_update_inode(trans, root, inode);
+                       if (ret)
+                               btrfs_end_transaction(trans, root);
+                       else
+                               ret = btrfs_end_transaction(trans, root);
                }
        }
+out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_NOFS);
 out:
+       /*
+        * As we have waited for the extent range, the data_rsv_map must
+        * be empty in the range, as any written data range will have been
+        * released from it. And for a preallocated extent, it will also
+        * be released when its metadata is written.
+        * So this is purely used as a cleanup.
+        */
+       btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+       btrfs_free_reserved_data_space(inode, alloc_start,
+                                      alloc_end - alloc_start);
        return ret;
 }