Btrfs: add semaphore to synchronize direct IO writes with fsync

author Filipe Manana <fdmanana@suse.com>

Thu, 12 May 2016 12:53:36 +0000 (13:53 +0100)

committer Filipe Manana <fdmanana@suse.com>

Fri, 13 May 2016 00:59:36 +0000 (01:59 +0100)
author Filipe Manana <fdmanana@suse.com>
Thu, 12 May 2016 12:53:36 +0000 (13:53 +0100)
committer Filipe Manana <fdmanana@suse.com>
Fri, 13 May 2016 00:59:36 +0000 (01:59 +0100)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 61205e3..1da5753 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,6 +196,16 @@ struct btrfs_inode {
         struct list_head delayed_iput;
         long delayed_iput_count;
  
+       /*
+        * To avoid races between lockless (i_mutex not held) direct IO writes
+        * and concurrent fsync requests. Direct IO writes must acquire read
+        * access on this semaphore for creating an extent map and its
+        * corresponding ordered extent. The fast fsync path must acquire write
+        * access on this semaphore before it collects ordered extents and
+        * extent maps.
+        */
+       struct rw_semaphore dio_sem;
+
         struct inode vfs_inode;
  };
  
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index ee9be41..c1ee4ad 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7145,6 +7145,43 @@ out:
         return em;
  }
  
+static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+                                                 const u64 start,
+                                                 const u64 len,
+                                                 const u64 orig_start,
+                                                 const u64 block_start,
+                                                 const u64 block_len,
+                                                 const u64 orig_block_len,
+                                                 const u64 ram_bytes,
+                                                 const int type)
+{
+       struct extent_map *em = NULL;
+       int ret;
+
+       down_read(&BTRFS_I(inode)->dio_sem);
+       if (type != BTRFS_ORDERED_NOCOW) {
+               em = create_pinned_em(inode, start, len, orig_start,
+                                     block_start, block_len, orig_block_len,
+                                     ram_bytes, type);
+               if (IS_ERR(em))
+                       goto out;
+       }
+       ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+                                          len, block_len, type);
+       if (ret) {
+               if (em) {
+                       free_extent_map(em);
+                       btrfs_drop_extent_cache(inode, start,
+                                               start + len - 1, 0);
+               }
+               em = ERR_PTR(ret);
+       }
+ out:
+       up_read(&BTRFS_I(inode)->dio_sem);
+
+       return em;
+}
+
  static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                                                   u64 start, u64 len)
  {
@@ -7160,43 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
         if (ret)
                 return ERR_PTR(ret);
  
-       /*
-        * Create the ordered extent before the extent map. This is to avoid
-        * races with the fast fsync path that would lead to it logging file
-        * extent items that point to disk extents that were not yet written to.
-        * The fast fsync path collects ordered extents into a local list and
-        * then collects all the new extent maps, so we must create the ordered
-        * extent first and make sure the fast fsync path collects any new
-        * ordered extents after collecting new extent maps as well.
-        * The fsync path simply can not rely on inode_dio_wait() because it
-        * causes deadlock with AIO.
-        */
-       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
-                                          ins.offset, ins.offset, 0);
-       if (ret) {
-               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               return ERR_PTR(ret);
-       }
-
+       em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+                                    ins.objectid, ins.offset, ins.offset,
+                                    ins.offset, 0);
         btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
-
-       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-                             ins.offset, ins.offset, ins.offset, 0);
-       if (IS_ERR(em)) {
-               struct btrfs_ordered_extent *oe;
-
+       if (IS_ERR(em))
                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               oe = btrfs_lookup_ordered_extent(inode, start);
-               ASSERT(oe);
-               if (WARN_ON(!oe))
-                       return em;
-               set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
-               set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
-               btrfs_remove_ordered_extent(inode, oe);
-               /* Once for our lookup and once for the ordered extents tree. */
-               btrfs_put_ordered_extent(oe);
-               btrfs_put_ordered_extent(oe);
-       }
+
         return em;
  }
  
@@ -7670,57 +7677,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                 if (can_nocow_extent(inode, start, &len, &orig_start,
                                      &orig_block_len, &ram_bytes) == 1 &&
                     btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+                       struct extent_map *em2;
  
-                       /*
-                        * Create the ordered extent before the extent map. This
-                        * is to avoid races with the fast fsync path because it
-                        * collects ordered extents into a local list and then
-                        * collects all the new extent maps, so we must create
-                        * the ordered extent first and make sure the fast fsync
-                        * path collects any new ordered extents after
-                        * collecting new extent maps as well. The fsync path
-                        * simply can not rely on inode_dio_wait() because it
-                        * causes deadlock with AIO.
-                        */
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
+                       em2 = btrfs_create_dio_extent(inode, start, len,
+                                                     orig_start, block_start,
+                                                     len, orig_block_len,
+                                                     ram_bytes, type);
                         btrfs_dec_nocow_writers(root->fs_info, block_start);
-                       if (ret) {
-                               free_extent_map(em);
-                               goto unlock_err;
-                       }
-
                         if (type == BTRFS_ORDERED_PREALLOC) {
                                 free_extent_map(em);
-                               em = create_pinned_em(inode, start, len,
-                                                      orig_start,
-                                                      block_start, len,
-                                                      orig_block_len,
-                                                      ram_bytes, type);
-                               if (IS_ERR(em)) {
-                                       struct btrfs_ordered_extent *oe;
-
-                                       ret = PTR_ERR(em);
-                                       oe = btrfs_lookup_ordered_extent(inode,
-                                                                        start);
-                                       ASSERT(oe);
-                                       if (WARN_ON(!oe))
-                                               goto unlock_err;
-                                       set_bit(BTRFS_ORDERED_IOERR,
-                                               &oe->flags);
-                                       set_bit(BTRFS_ORDERED_IO_DONE,
-                                               &oe->flags);
-                                       btrfs_remove_ordered_extent(inode, oe);
-                                       /*
-                                        * Once for our lookup and once for the
-                                        * ordered extents tree.
-                                        */
-                                       btrfs_put_ordered_extent(oe);
-                                       btrfs_put_ordered_extent(oe);
-                                       goto unlock_err;
-                               }
+                               em = em2;
+                       }
+                       if (em2 && IS_ERR(em2)) {
+                               ret = PTR_ERR(em2);
+                               goto unlock_err;
                         }
-
                         goto unlock;
                 }
         }
@@ -9281,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         INIT_LIST_HEAD(&ei->delalloc_inodes);
         INIT_LIST_HEAD(&ei->delayed_iput);
         RB_CLEAR_NODE(&ei->rb_node);
+       init_rwsem(&ei->dio_sem);
  
         return inode;
  }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index a24a0ba..003a826 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
  
         INIT_LIST_HEAD(&extents);
  
+       down_write(&BTRFS_I(inode)->dio_sem);
         write_lock(&tree->lock);
         test_gen = root->fs_info->last_trans_committed;
  
@@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
         }
  
         list_sort(NULL, &extents, extent_cmp);
+       btrfs_get_logged_extents(inode, logged_list, start, end);
         /*
-        * Collect any new ordered extents within the range. This is to
-        * prevent logging file extent items without waiting for the disk
-        * location they point to being written. We do this only to deal
-        * with races against concurrent lockless direct IO writes.
+        * Some ordered extents started by fsync might have completed
+        * before we could collect them into the list logged_list, which
+        * means they're gone, not in our logged_list nor in the inode's
+        * ordered tree. We want the application/user space to know an
+        * error happened while attempting to persist file data so that
+        * it can take proper action. If such error happened, we leave
+        * without writing to the log tree and the fsync must report the
+        * file data write error and not commit the current transaction.
          */
-       btrfs_get_logged_extents(inode, logged_list, start, end);
+       ret = btrfs_inode_check_errors(inode);
+       if (ret)
+               ctx->io_err = ret;
  process:
         while (!list_empty(&extents)) {
                 em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4210,7 @@ process:
         }
         WARN_ON(!list_empty(&extents));
         write_unlock(&tree->lock);
+       up_write(&BTRFS_I(inode)->dio_sem);
  
         btrfs_release_path(path);
         return ret;
@@ -4622,23 +4631,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  
         mutex_lock(&BTRFS_I(inode)->log_mutex);
  
-       /*
-        * Collect ordered extents only if we are logging data. This is to
-        * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
-        * will process the ordered extents if they still exists at the time,
-        * because when we collect them we test and set for the flag
-        * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
-        * same ordered extents. The consequence for the LOG_INODE_ALL log mode
-        * not processing the ordered extents is that we end up logging the
-        * corresponding file extent items, based on the extent maps in the
-        * inode's extent_map_tree's modified_list, without logging the
-        * respective checksums (since the may still be only attached to the
-        * ordered extents and have not been inserted in the csum tree by
-        * btrfs_finish_ordered_io() yet).
-        */
-       if (inode_only == LOG_INODE_ALL)
-               btrfs_get_logged_extents(inode, &logged_list, start, end);
-
         /*
          * a brute force approach to making sure we get the most uptodate
          * copies of everything.
@@ -4846,21 +4838,6 @@ log_extents:
                         goto out_unlock;
         }
         if (fast_search) {
-               /*
-                * Some ordered extents started by fsync might have completed
-                * before we collected the ordered extents in logged_list, which
-                * means they're gone, not in our logged_list nor in the inode's
-                * ordered tree. We want the application/user space to know an
-                * error happened while attempting to persist file data so that
-                * it can take proper action. If such error happened, we leave
-                * without writing to the log tree and the fsync must report the
-                * file data write error and not commit the current transaction.
-                */
-               err = btrfs_inode_check_errors(inode);
-               if (err) {
-                       ctx->io_err = err;
-                       goto out_unlock;
-               }
                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                 &logged_list, ctx, start, end);
                 if (ret) {
author	Filipe Manana <fdmanana@suse.com>
	Thu, 12 May 2016 12:53:36 +0000 (13:53 +0100)
committer	Filipe Manana <fdmanana@suse.com>
	Fri, 13 May 2016 00:59:36 +0000 (01:59 +0100)
fs/btrfs/btrfs_inode.h		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/tree-log.c		patch \| blob \| history