Btrfs: fix file loss on log replay after renaming a file and fsync
[cascardo/linux.git] / fs / btrfs / ioctl.c
index e65fdc8..0af5ecb 100644 (file)
@@ -59,6 +59,7 @@
 #include "props.h"
 #include "sysfs.h"
 #include "qgroup.h"
+#include "tree-log.h"
 
 #ifdef CONFIG_64BIT
 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -347,7 +348,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
        btrfs_update_iflags(inode);
        inode_inc_iversion(inode);
-       inode->i_ctime = CURRENT_TIME;
+       inode->i_ctime = current_fs_time(inode->i_sb);
        ret = btrfs_update_inode(trans, root, inode);
 
        btrfs_end_transaction(trans, root);
@@ -443,7 +444,7 @@ static noinline int create_subvol(struct inode *dir,
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *new_root;
        struct btrfs_block_rsv block_rsv;
-       struct timespec cur_time = CURRENT_TIME;
+       struct timespec cur_time = current_fs_time(dir->i_sb);
        struct inode *inode;
        int ret;
        int err;
@@ -2093,8 +2094,6 @@ static noinline int search_ioctl(struct inode *inode,
                key.offset = (u64)-1;
                root = btrfs_read_fs_root_no_name(info, &key);
                if (IS_ERR(root)) {
-                       btrfs_err(info, "could not find root %llu",
-                              sk->tree_id);
                        btrfs_free_path(path);
                        return -ENOENT;
                }
@@ -2472,6 +2471,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;
 
+       btrfs_record_snapshot_destroy(trans, dir);
+
        ret = btrfs_unlink_subvol(trans, root, dir,
                                dest->root_key.objectid,
                                dentry->d_name.name,
@@ -2790,24 +2791,29 @@ out:
 static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 {
        struct page *page;
-       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
        page = grab_cache_page(inode->i_mapping, index);
        if (!page)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        if (!PageUptodate(page)) {
-               if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
-                                                0))
-                       return NULL;
+               int ret;
+
+               ret = btrfs_readpage(NULL, page);
+               if (ret)
+                       return ERR_PTR(ret);
                lock_page(page);
                if (!PageUptodate(page)) {
                        unlock_page(page);
                        page_cache_release(page);
-                       return NULL;
+                       return ERR_PTR(-EIO);
+               }
+               if (page->mapping != inode->i_mapping) {
+                       unlock_page(page);
+                       page_cache_release(page);
+                       return ERR_PTR(-EAGAIN);
                }
        }
-       unlock_page(page);
 
        return page;
 }
@@ -2819,17 +2825,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
        pgoff_t index = off >> PAGE_CACHE_SHIFT;
 
        for (i = 0; i < num_pages; i++) {
+again:
                pages[i] = extent_same_get_page(inode, index + i);
-               if (!pages[i])
-                       return -ENOMEM;
+               if (IS_ERR(pages[i])) {
+                       int err = PTR_ERR(pages[i]);
+
+                       if (err == -EAGAIN)
+                               goto again;
+                       pages[i] = NULL;
+                       return err;
+               }
        }
        return 0;
 }
 
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+                            bool retry_range_locking)
 {
-       /* do any pending delalloc/csum calc on src, one way or
-          another, and lock file content */
+       /*
+        * Do any pending delalloc/csum calculations on inode, one way or
+        * another, and lock file content.
+        * The locking order is:
+        *
+        *   1) pages
+        *   2) range in the inode's io tree
+        */
        while (1) {
                struct btrfs_ordered_extent *ordered;
                lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2847,8 +2867,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
                unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
+               if (!retry_range_locking)
+                       return -EAGAIN;
                btrfs_wait_ordered_range(inode, off, len);
        }
+       return 0;
 }
 
 static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
@@ -2873,15 +2896,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
        unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 }
 
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
-                                    struct inode *inode2, u64 loff2, u64 len)
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+                                   struct inode *inode2, u64 loff2, u64 len,
+                                   bool retry_range_locking)
 {
+       int ret;
+
        if (inode1 < inode2) {
                swap(inode1, inode2);
                swap(loff1, loff2);
        }
-       lock_extent_range(inode1, loff1, len);
-       lock_extent_range(inode2, loff2, len);
+       ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
+       if (ret)
+               return ret;
+       ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+       if (ret)
+               unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+                             loff1 + len - 1);
+       return ret;
 }
 
 struct cmp_pages {
@@ -2897,11 +2929,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
 
        for (i = 0; i < cmp->num_pages; i++) {
                pg = cmp->src_pages[i];
-               if (pg)
+               if (pg) {
+                       unlock_page(pg);
                        page_cache_release(pg);
+               }
                pg = cmp->dst_pages[i];
-               if (pg)
+               if (pg) {
+                       unlock_page(pg);
                        page_cache_release(pg);
+               }
        }
        kfree(cmp->src_pages);
        kfree(cmp->dst_pages);
@@ -2921,8 +2957,8 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
         * of the array is bounded by len, which is in turn bounded by
         * BTRFS_MAX_DEDUPE_LEN.
         */
-       src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
-       dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+       src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+       dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
        if (!src_pgarr || !dst_pgarr) {
                kfree(src_pgarr);
                kfree(dst_pgarr);
@@ -2962,6 +2998,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 
                src_page = cmp->src_pages[i];
                dst_page = cmp->dst_pages[i];
+               ASSERT(PageLocked(src_page));
+               ASSERT(PageLocked(dst_page));
 
                addr = kmap_atomic(src_page);
                dst_addr = kmap_atomic(dst_page);
@@ -3074,14 +3112,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
                goto out_unlock;
        }
 
+again:
        ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
        if (ret)
                goto out_unlock;
 
        if (same_inode)
-               lock_extent_range(src, same_lock_start, same_lock_len);
+               ret = lock_extent_range(src, same_lock_start, same_lock_len,
+                                       false);
        else
-               btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+               ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+                                              false);
+       /*
+        * If one of the inodes has dirty pages in the respective range or
+        * ordered extents, we need to flush dellaloc and wait for all ordered
+        * extents in the range. We must unlock the pages and the ranges in the
+        * io trees to avoid deadlocks when flushing delalloc (requires locking
+        * pages) and when waiting for ordered extents to complete (they require
+        * range locking).
+        */
+       if (ret == -EAGAIN) {
+               /*
+                * Ranges in the io trees already unlocked. Now unlock all
+                * pages before waiting for all IO to complete.
+                */
+               btrfs_cmp_data_free(&cmp);
+               if (same_inode) {
+                       btrfs_wait_ordered_range(src, same_lock_start,
+                                                same_lock_len);
+               } else {
+                       btrfs_wait_ordered_range(src, loff, len);
+                       btrfs_wait_ordered_range(dst, dst_loff, len);
+               }
+               goto again;
+       }
+       ASSERT(ret == 0);
+       if (WARN_ON(ret)) {
+               /* ranges in the io trees already unlocked */
+               btrfs_cmp_data_free(&cmp);
+               return ret;
+       }
 
        /* pass original length for comparison so we stay within i_size */
        ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3144,7 +3214,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 
        inode_inc_iversion(inode);
        if (!no_time_update)
-               inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+               inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
        /*
         * We round up to the block size at eof when determining which
         * extents to clone above, but shouldn't round up the file size.
@@ -3791,9 +3861,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
                u64 lock_start = min_t(u64, off, destoff);
                u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
 
-               lock_extent_range(src, lock_start, lock_len);
+               ret = lock_extent_range(src, lock_start, lock_len, true);
        } else {
-               btrfs_double_extent_lock(src, off, inode, destoff, len);
+               ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+                                              true);
+       }
+       ASSERT(ret == 0);
+       if (WARN_ON(ret)) {
+               /* ranges in the io trees already unlocked */
+               goto out_unlock;
        }
 
        ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
@@ -3810,8 +3886,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
         * Truncate page cache pages so that future reads will see the cloned
         * data immediately and not the previous data.
         */
-       truncate_inode_pages_range(&inode->i_data, destoff,
-                                  PAGE_CACHE_ALIGN(destoff + len) - 1);
+       truncate_inode_pages_range(&inode->i_data,
+                               round_down(destoff, PAGE_CACHE_SIZE),
+                               round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
 out_unlock:
        if (!same_inode)
                btrfs_double_inode_unlock(src, inode);
@@ -4952,7 +5029,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_root_item *root_item = &root->root_item;
        struct btrfs_trans_handle *trans;
-       struct timespec ct = CURRENT_TIME;
+       struct timespec ct = current_fs_time(inode->i_sb);
        int ret = 0;
        int received_uuid_changed;
 
@@ -5183,8 +5260,7 @@ out_unlock:
          .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
          .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
 
-static int btrfs_ioctl_get_supported_features(struct file *file,
-                                             void __user *arg)
+int btrfs_ioctl_get_supported_features(void __user *arg)
 {
        static const struct btrfs_ioctl_feature_flags features[3] = {
                INIT_FEATURE_FLAGS(SUPP),
@@ -5463,7 +5539,7 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_SET_FSLABEL:
                return btrfs_ioctl_set_fslabel(file, argp);
        case BTRFS_IOC_GET_SUPPORTED_FEATURES:
-               return btrfs_ioctl_get_supported_features(file, argp);
+               return btrfs_ioctl_get_supported_features(argp);
        case BTRFS_IOC_GET_FEATURES:
                return btrfs_ioctl_get_features(file, argp);
        case BTRFS_IOC_SET_FEATURES: