}
write_unlock(&map_tree->lock);
+ /*
+ * Keep looping until we have no more ranges in the io tree.
+ * We can have ongoing bios started by readpages (called from readahead)
+ * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * still in progress (unlocked the pages in the bio but did not yet
+ * unlocked the ranges in the io tree). Therefore this means some
+ * ranges can still be locked and eviction started because before
+ * submitting those bios, which are executed by a separate task (work
+ * queue kthread), inode references (inode->i_count) were not taken
+ * (which would be dropped in the end io callback of each bio).
+ * Therefore here we effectively end up waiting for those bios and
+ * anyone else holding locked ranges without having bumped the inode's
+ * reference count - if we don't do it, when they access the inode's
+ * io_tree to unlock a range it may be too late, leading to an
+ * use-after-free issue.
+ */
spin_lock(&io_tree->lock);
while (!RB_EMPTY_ROOT(&io_tree->state)) {
struct extent_state *state;
struct extent_state *cached_state = NULL;
+ u64 start;
+ u64 end;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
- atomic_inc(&state->refs);
+ start = state->start;
+ end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, state->start, state->end,
- 0, &cached_state);
- clear_extent_bit(io_tree, state->start, state->end,
+ lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
- free_extent_state(state);
cond_resched();
spin_lock(&io_tree->lock);
static void btrfs_submit_direct(int rw, struct bio *dio_bio,
struct inode *inode, loff_t file_offset)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_dio_private *dip;
- struct bio *io_bio;
+ struct btrfs_dio_private *dip = NULL;
+ struct bio *io_bio = NULL;
struct btrfs_io_bio *btrfs_bio;
int skip_sum;
int write = rw & REQ_WRITE;
dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
- goto free_io_bio;
+ goto free_ordered;
}
dip->private = dio_bio->bi_private;
if (btrfs_bio->end_io)
btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
- bio_put(io_bio);
free_ordered:
/*
- * If this is a write, we need to clean up the reserved space and kill
- * the ordered extent.
+ * If we arrived here it means either we failed to submit the dip
+ * or we either failed to clone the dio_bio or failed to allocate the
+ * dip. If we cloned the dio_bio and allocated the dip, we can just
+ * call bio_endio against our io_bio so that we get proper resource
+ * cleanup if we fail to submit the dip, otherwise, we must do the
+ * same as btrfs_endio_direct_[write|read] because we can't call these
+ * callbacks - they require an allocated dip and a clone of dio_bio.
*/
- if (write) {
- struct btrfs_ordered_extent *ordered;
- ordered = btrfs_lookup_ordered_extent(inode, file_offset);
- if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
- btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len, 1);
- btrfs_put_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
+ if (io_bio && dip) {
+ bio_endio(io_bio, ret);
+ /*
+ * The end io callbacks free our dip, do the final put on io_bio
+ * and all the cleanup and final put for dio_bio (through
+ * dio_end_io()).
+ */
+ dip = NULL;
+ io_bio = NULL;
+ } else {
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_extent(inode,
+ file_offset);
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ /*
+ * Decrements our ref on the ordered extent and removes
+ * the ordered extent from the inode's ordered tree,
+ * doing all the proper resource cleanup such as for the
+ * reserved space and waking up any waiters for this
+ * ordered extent (through btrfs_remove_ordered_extent).
+ */
+ btrfs_finish_ordered_io(ordered);
+ } else {
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+ file_offset + dio_bio->bi_iter.bi_size - 1);
+ }
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ /*
+ * Releases and cleans up our dio_bio, no need to bio_put()
+ * nor bio_endio()/bio_io_error() against dio_bio.
+ */
+ dio_end_io(dio_bio, ret);
}
- bio_endio(dio_bio, ret);
+ if (io_bio)
+ bio_put(io_bio);
+ kfree(dip);
}
static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,