Btrfs: fix race between fsync and direct IO writes for prealloc extents

[cascardo/linux.git] / fs / btrfs / inode.c
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 452cfef..45d0daf 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7658,6 +7658,25 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
  
                 if (can_nocow_extent(inode, start, &len, &orig_start,
                                      &orig_block_len, &ram_bytes) == 1) {
+
+                       /*
+                        * Create the ordered extent before the extent map. This
+                        * is to avoid races with the fast fsync path because it
+                        * collects ordered extents into a local list and then
+                        * collects all the new extent maps, so we must create
+                        * the ordered extent first and make sure the fast fsync
+                        * path collects any new ordered extents after
+                        * collecting new extent maps as well. The fsync path
+                        * simply cannot rely on inode_dio_wait() because it
+                        * causes a deadlock with AIO.
+                        */
+                       ret = btrfs_add_ordered_extent_dio(inode, start,
+                                          block_start, len, len, type);
+                       if (ret) {
+                               free_extent_map(em);
+                               goto unlock_err;
+                       }
+
                         if (type == BTRFS_ORDERED_PREALLOC) {
                                 free_extent_map(em);
                                 em = create_pinned_em(inode, start, len,
@@ -7666,17 +7685,29 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                                        orig_block_len,
                                                        ram_bytes, type);
                                 if (IS_ERR(em)) {
+                                       struct btrfs_ordered_extent *oe;
+
                                         ret = PTR_ERR(em);
+                                       oe = btrfs_lookup_ordered_extent(inode,
+                                                                        start);
+                                       ASSERT(oe);
+                                       if (WARN_ON(!oe))
+                                               goto unlock_err;
+                                       set_bit(BTRFS_ORDERED_IOERR,
+                                               &oe->flags);
+                                       set_bit(BTRFS_ORDERED_IO_DONE,
+                                               &oe->flags);
+                                       btrfs_remove_ordered_extent(inode, oe);
+                                       /*
+                                        * Once for our lookup and once for the
+                                        * ordered extents tree.
+                                        */
+                                       btrfs_put_ordered_extent(oe);
+                                       btrfs_put_ordered_extent(oe);
                                         goto unlock_err;
                                 }
                         }
  
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
-                       if (ret) {
-                               free_extent_map(em);
-                               goto unlock_err;
-                       }
                         goto unlock;
                 }
         }
@@ -9412,6 +9443,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
         u64 new_idx = 0;
         u64 root_objectid;
         int ret;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
  
         /* we only allow rename subvolume link between subvolumes */
         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
@@ -9456,6 +9489,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(root->fs_info, trans);
         } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@ -9463,7 +9498,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                              btrfs_ino(new_dir), old_idx);
                 if (ret)
                         goto out_fail;
-               btrfs_pin_log_trans(root);
         }
  
         /* And now for the dest. */
@@ -9471,6 +9505,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(dest->fs_info, trans);
         } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, root,
                                              old_dentry->d_name.name,
                                              old_dentry->d_name.len,
@@ -9478,7 +9514,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                              btrfs_ino(old_dir), new_idx);
                 if (ret)
                         goto out_fail;
-               btrfs_pin_log_trans(dest);
         }
  
         /* Update inode version and ctime/mtime. */
@@ -9557,17 +9592,47 @@ static int btrfs_rename_exchange(struct inode *old_dir,
         if (new_inode->i_nlink == 1)
                 BTRFS_I(new_inode)->dir_index = new_idx;
  
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (root_log_pinned) {
                 parent = new_dentry->d_parent;
                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
                 btrfs_end_log_trans(root);
+               root_log_pinned = false;
         }
-       if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (dest_log_pinned) {
                 parent = old_dentry->d_parent;
                 btrfs_log_new_name(trans, new_inode, new_dir, parent);
                 btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
         }
  out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fall back to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or that leads to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abort reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
         ret = btrfs_end_transaction(trans, root);
  out_notrans:
         if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9612,21 +9677,21 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
         ret = btrfs_init_inode_security(trans, inode, dir,
                                 &dentry->d_name);
         if (ret)
-               return ret;
+               goto out;
  
         ret = btrfs_add_nondir(trans, dir, dentry,
                                 inode, 0, index);
         if (ret)
-               return ret;
+               goto out;
  
         ret = btrfs_update_inode(trans, root, inode);
-       if (ret)
-               return ret;
-
+out:
         unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
         iput(inode);
  
-       return 0;
+       return ret;
  }
  
  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -9634,6 +9699,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                            unsigned int flags)
  {
         struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
         struct btrfs_root *root = BTRFS_I(old_dir)->root;
         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
         struct inode *new_inode = d_inode(new_dentry);
@@ -9696,8 +9762,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
          * would require 5 item modifications, so we'll assume they are normal
          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
          * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
          */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
                 goto out_notrans;