Btrfs: fix file read corruption after extent cloning and fsync

author Filipe Manana <fdmanana@suse.com>

Wed, 19 Aug 2015 10:09:40 +0000 (11:09 +0100)

committer Chris Mason <clm@fb.com>

Wed, 19 Aug 2015 21:27:46 +0000 (14:27 -0700)
author Filipe Manana <fdmanana@suse.com>
Wed, 19 Aug 2015 10:09:40 +0000 (11:09 +0100)
committer Chris Mason <clm@fb.com>
Wed, 19 Aug 2015 21:27:46 +0000 (14:27 -0700)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index 6d65046..1bbaace 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -722,11 +722,65 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                 &ordered_sums, 0);
                         if (ret)
                                 goto out;
+                       /*
+                        * Now delete all existing csums in the csum root that
+                        * cover our range. We do this because we can have an
+                        * extent that is completely referenced by one file
+                        * extent item and partially referenced by another
+                        * file extent item (like after using the clone or
+                        * extent_same ioctls). In this case if we end up doing
+                        * the replay of the one that partially references the
+                        * extent first, and we do not do the csum deletion
+                        * below, we can get 2 csum items in the csum tree that
+                        * overlap each other. For example, imagine our log has
+                        * the two following file extent items:
+                        *
+                        * key (257 EXTENT_DATA 409600)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 20480 nr 20480 ram 102400
+                        *
+                        * key (257 EXTENT_DATA 819200)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 0 nr 102400 ram 102400
+                        *
+                        * Where the second one fully references the 100K extent
+                        * that starts at disk byte 12845056, and the log tree
+                        * has a single csum item that covers the entire range
+                        * of the extent:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        *
+                        * After the first file extent item is replayed, the
+                        * csum tree gets the following csum item:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which covers the 20K sub-range starting at offset 20K
+                        * of our extent. Now when we replay the second file
+                        * extent item, if we do not delete existing csum items
+                        * that cover any of its blocks, we end up getting two
+                        * csum items in our csum tree that overlap each other:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which is a problem, because after this anyone trying
+                        * to look up the checksum of any block of our
+                        * extent starting at an offset of 40K or higher, will
+                        * end up looking at the second csum item only, which
+                        * does not contain the checksum for any block starting
+                        * at offset 40K or higher of our extent.
+                        */
                         while (!list_empty(&ordered_sums)) {
                                 struct btrfs_ordered_sum *sums;
                                 sums = list_entry(ordered_sums.next,
                                                 struct btrfs_ordered_sum,
                                                 list);
+                               if (!ret)
+                                       ret = btrfs_del_csums(trans,
+                                                     root->fs_info->csum_root,
+                                                     sums->bytenr,
+                                                     sums->len);
                                 if (!ret)
                                         ret = btrfs_csum_file_blocks(trans,
                                                 root->fs_info->csum_root,
author	Filipe Manana <fdmanana@suse.com>
	Wed, 19 Aug 2015 10:09:40 +0000 (11:09 +0100)
committer	Chris Mason <clm@fb.com>
	Wed, 19 Aug 2015 21:27:46 +0000 (14:27 -0700)