Btrfs: fix race between fsync and direct IO writes for prealloc extents
[cascardo/linux.git] / fs / btrfs / inode.c
index 452cfef..45d0daf 100644 (file)
@@ -7658,6 +7658,25 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 
                if (can_nocow_extent(inode, start, &len, &orig_start,
                                     &orig_block_len, &ram_bytes) == 1) {
+
+                       /*
+                        * Create the ordered extent before the extent map. This
+                        * is to avoid races with the fast fsync path because it
+                        * collects ordered extents into a local list and then
+                        * collects all the new extent maps, so we must create
+                        * the ordered extent first and make sure the fast fsync
+                        * path collects any new ordered extents after
+                        * collecting new extent maps as well. The fsync path
+                        * simply cannot rely on inode_dio_wait() because it
+                        * causes a deadlock with AIO.
+                        */
+                       ret = btrfs_add_ordered_extent_dio(inode, start,
+                                          block_start, len, len, type);
+                       if (ret) {
+                               free_extent_map(em);
+                               goto unlock_err;
+                       }
+
                        if (type == BTRFS_ORDERED_PREALLOC) {
                                free_extent_map(em);
                                em = create_pinned_em(inode, start, len,
@@ -7666,17 +7685,29 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                                       orig_block_len,
                                                       ram_bytes, type);
                                if (IS_ERR(em)) {
+                                       struct btrfs_ordered_extent *oe;
+
                                        ret = PTR_ERR(em);
+                                       oe = btrfs_lookup_ordered_extent(inode,
+                                                                        start);
+                                       ASSERT(oe);
+                                       if (WARN_ON(!oe))
+                                               goto unlock_err;
+                                       set_bit(BTRFS_ORDERED_IOERR,
+                                               &oe->flags);
+                                       set_bit(BTRFS_ORDERED_IO_DONE,
+                                               &oe->flags);
+                                       btrfs_remove_ordered_extent(inode, oe);
+                                       /*
+                                        * Once for our lookup and once for the
+                                        * ordered extents tree.
+                                        */
+                                       btrfs_put_ordered_extent(oe);
+                                       btrfs_put_ordered_extent(oe);
                                        goto unlock_err;
                                }
                        }
 
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
-                       if (ret) {
-                               free_extent_map(em);
-                               goto unlock_err;
-                       }
                        goto unlock;
                }
        }
@@ -9412,6 +9443,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
        u64 new_idx = 0;
        u64 root_objectid;
        int ret;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
 
        /* we only allow rename subvolume link between subvolumes */
        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
@@ -9456,6 +9489,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(root->fs_info, trans);
        } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
@@ -9463,7 +9498,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                             btrfs_ino(new_dir), old_idx);
                if (ret)
                        goto out_fail;
-               btrfs_pin_log_trans(root);
        }
 
        /* And now for the dest. */
@@ -9471,6 +9505,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(dest->fs_info, trans);
        } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, root,
                                             old_dentry->d_name.name,
                                             old_dentry->d_name.len,
@@ -9478,7 +9514,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                             btrfs_ino(old_dir), new_idx);
                if (ret)
                        goto out_fail;
-               btrfs_pin_log_trans(dest);
        }
 
        /* Update inode version and ctime/mtime. */
@@ -9557,17 +9592,47 @@ static int btrfs_rename_exchange(struct inode *old_dir,
        if (new_inode->i_nlink == 1)
                BTRFS_I(new_inode)->dir_index = new_idx;
 
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (root_log_pinned) {
                parent = new_dentry->d_parent;
                btrfs_log_new_name(trans, old_inode, old_dir, parent);
                btrfs_end_log_trans(root);
+               root_log_pinned = false;
        }
-       if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (dest_log_pinned) {
                parent = old_dentry->d_parent;
                btrfs_log_new_name(trans, new_inode, new_dir, parent);
                btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
        }
 out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes, or one that leads to
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abort reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
        ret = btrfs_end_transaction(trans, root);
 out_notrans:
        if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9612,21 +9677,21 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
        ret = btrfs_init_inode_security(trans, inode, dir,
                                &dentry->d_name);
        if (ret)
-               return ret;
+               goto out;
 
        ret = btrfs_add_nondir(trans, dir, dentry,
                                inode, 0, index);
        if (ret)
-               return ret;
+               goto out;
 
        ret = btrfs_update_inode(trans, root, inode);
-       if (ret)
-               return ret;
-
+out:
        unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
        iput(inode);
 
-       return 0;
+       return ret;
 }
 
 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -9634,6 +9699,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                           unsigned int flags)
 {
        struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
        struct btrfs_root *root = BTRFS_I(old_dir)->root;
        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
        struct inode *new_inode = d_inode(new_dentry);
@@ -9696,8 +9762,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * would require 5 item modifications, so we'll assume they are normal
         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
         * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
         */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_notrans;