Btrfs: fix a comment in inode.c:evict_inode_truncate_pages()
[cascardo/linux.git] / fs / btrfs / inode.c
index 8bb0136..53afda0 100644 (file)
@@ -4986,24 +4986,41 @@ static void evict_inode_truncate_pages(struct inode *inode)
        }
        write_unlock(&map_tree->lock);
 
+       /*
+        * Keep looping until we have no more ranges in the io tree.
+        * We can have ongoing bios started by readpages (called from readahead)
+        * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+        * still in progress (unlocked the pages in the bio but did not yet
+        * unlocked the ranges in the io tree). Therefore this means some
+        * ranges can still be locked and eviction started because before
+        * submitting those bios, which are executed by a separate task (work
+        * queue kthread), inode references (inode->i_count) were not taken
+        * (which would be dropped in the end io callback of each bio).
+        * Therefore here we effectively end up waiting for those bios and
+        * anyone else holding locked ranges without having bumped the inode's
+        * reference count - if we don't do it, when they access the inode's
+        * io_tree to unlock a range it may be too late, leading to an
+        * use-after-free issue.
+        */
        spin_lock(&io_tree->lock);
        while (!RB_EMPTY_ROOT(&io_tree->state)) {
                struct extent_state *state;
                struct extent_state *cached_state = NULL;
+               u64 start;
+               u64 end;
 
                node = rb_first(&io_tree->state);
                state = rb_entry(node, struct extent_state, rb_node);
-               atomic_inc(&state->refs);
+               start = state->start;
+               end = state->end;
                spin_unlock(&io_tree->lock);
 
-               lock_extent_bits(io_tree, state->start, state->end,
-                                0, &cached_state);
-               clear_extent_bit(io_tree, state->start, state->end,
+               lock_extent_bits(io_tree, start, end, 0, &cached_state);
+               clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
                                 EXTENT_DEFRAG, 1, 1,
                                 &cached_state, GFP_NOFS);
-               free_extent_state(state);
 
                cond_resched();
                spin_lock(&io_tree->lock);
@@ -8147,9 +8164,8 @@ out_err:
 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
                                struct inode *inode, loff_t file_offset)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_dio_private *dip;
-       struct bio *io_bio;
+       struct btrfs_dio_private *dip = NULL;
+       struct bio *io_bio = NULL;
        struct btrfs_io_bio *btrfs_bio;
        int skip_sum;
        int write = rw & REQ_WRITE;
@@ -8166,7 +8182,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
        dip = kzalloc(sizeof(*dip), GFP_NOFS);
        if (!dip) {
                ret = -ENOMEM;
-               goto free_io_bio;
+               goto free_ordered;
        }
 
        dip->private = dio_bio->bi_private;
@@ -8194,25 +8210,55 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 
        if (btrfs_bio->end_io)
                btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
-       bio_put(io_bio);
 
 free_ordered:
        /*
-        * If this is a write, we need to clean up the reserved space and kill
-        * the ordered extent.
+        * If we arrived here it means either we failed to submit the dip
+        * or we either failed to clone the dio_bio or failed to allocate the
+        * dip. If we cloned the dio_bio and allocated the dip, we can just
+        * call bio_endio against our io_bio so that we get proper resource
+        * cleanup if we fail to submit the dip, otherwise, we must do the
+        * same as btrfs_endio_direct_[write|read] because we can't call these
+        * callbacks - they require an allocated dip and a clone of dio_bio.
         */
-       if (write) {
-               struct btrfs_ordered_extent *ordered;
-               ordered = btrfs_lookup_ordered_extent(inode, file_offset);
-               if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
-                   !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
-                       btrfs_free_reserved_extent(root, ordered->start,
-                                                  ordered->disk_len, 1);
-               btrfs_put_ordered_extent(ordered);
-               btrfs_put_ordered_extent(ordered);
+       if (io_bio && dip) {
+               bio_endio(io_bio, ret);
+               /*
+                * The end io callbacks free our dip, do the final put on io_bio
+                * and all the cleanup and final put for dio_bio (through
+                * dio_end_io()).
+                */
+               dip = NULL;
+               io_bio = NULL;
+       } else {
+               if (write) {
+                       struct btrfs_ordered_extent *ordered;
+
+                       ordered = btrfs_lookup_ordered_extent(inode,
+                                                             file_offset);
+                       set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+                       /*
+                        * Decrements our ref on the ordered extent and removes
+                        * the ordered extent from the inode's ordered tree,
+                        * doing all the proper resource cleanup such as for the
+                        * reserved space and waking up any waiters for this
+                        * ordered extent (through btrfs_remove_ordered_extent).
+                        */
+                       btrfs_finish_ordered_io(ordered);
+               } else {
+                       unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+                             file_offset + dio_bio->bi_iter.bi_size - 1);
+               }
+               clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+               /*
+                * Releases and cleans up our dio_bio, no need to bio_put()
+                * nor bio_endio()/bio_io_error() against dio_bio.
+                */
+               dio_end_io(dio_bio, ret);
        }
-       bio_endio(dio_bio, ret);
+       if (io_bio)
+               bio_put(io_bio);
+       kfree(dip);
 }
 
 static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,