Btrfs: fix a bug in checking whether a inode is already in log
[cascardo/linux.git] / fs / btrfs / inode.c
index 6e8f416..24745b8 100644 (file)
@@ -247,7 +247,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                return 1;
        }
 
-       ret = btrfs_drop_extents(trans, inode, start, aligned_end,
+       ret = btrfs_drop_extents(trans, root, inode, start, aligned_end,
                                 &hint_byte, 1);
        if (ret)
                return ret;
@@ -1008,9 +1008,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;
 
-       atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
-
-       if (atomic_read(&root->fs_info->async_delalloc_pages) <
+       if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
                wake_up(&root->fs_info->async_submit_wait);
@@ -1310,6 +1308,7 @@ out_check:
                        em->block_start = disk_bytenr;
                        em->bdev = root->fs_info->fs_devices->latest_bdev;
                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
+                       set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                        while (1) {
                                write_lock(&em_tree->lock);
                                ret = add_extent_mapping(em_tree, em);
@@ -1805,7 +1804,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
         * the caller is expected to unpin it and allow it to be merged
         * with the others.
         */
-       ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+       ret = btrfs_drop_extents(trans, root, inode, file_pos,
+                                file_pos + num_bytes,
                                 &hint, 0);
        if (ret)
                goto out;
@@ -1885,8 +1885,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                                trans = btrfs_join_transaction_nolock(root);
                        else
                                trans = btrfs_join_transaction(root);
-                       if (IS_ERR(trans))
-                               return PTR_ERR(trans);
+                       if (IS_ERR(trans)) {
+                               ret = PTR_ERR(trans);
+                               trans = NULL;
+                               goto out;
+                       }
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode_fallback(trans, root, inode);
                        if (ret) /* -ENOMEM or corruption */
@@ -1928,11 +1931,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                                                ordered_extent->len,
                                                compress_type, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
-               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-                                  ordered_extent->file_offset,
-                                  ordered_extent->len);
        }
-
+       unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+                          ordered_extent->file_offset, ordered_extent->len,
+                          trans->transid);
        if (ret < 0) {
                btrfs_abort_transaction(trans, root, ret);
                goto out_unlock;
@@ -1948,6 +1950,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                        btrfs_abort_transaction(trans, root, ret);
                        goto out_unlock;
                }
+       } else {
+               btrfs_set_inode_last_trans(trans, inode);
        }
        ret = 0;
 out_unlock:
@@ -2224,7 +2228,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                        insert = 1;
 #endif
                insert = 1;
-               atomic_dec(&root->orphan_inodes);
+               atomic_inc(&root->orphan_inodes);
        }
 
        if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2589,6 +2593,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+       BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+       /*
+        * If we were modified in the current generation and evicted from memory
+        * and then re-read we need to do a full sync since we don't have any
+        * idea about which extents were modified before we were evicted from
+        * cache.
+        */
+       if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+               set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                       &BTRFS_I(inode)->runtime_flags);
+
        inode->i_version = btrfs_inode_sequence(leaf, inode_item);
        inode->i_generation = BTRFS_I(inode)->generation;
        inode->i_rdev = 0;
@@ -3174,7 +3190,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
        inode_inc_iversion(dir);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-       ret = btrfs_update_inode(trans, root, dir);
+       ret = btrfs_update_inode_fallback(trans, root, dir);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
 out:
@@ -3266,8 +3282,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->reada = -1;
 
+       /*
+        * We want to drop from the next block forward in case this new size is
+        * not block aligned since we will be keeping the last block of the
+        * extent just the way it is.
+        */
        if (root->ref_cows || root == root->fs_info->tree_root)
-               btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+               btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
 
        /*
         * This function is also used to drop the items in the log tree before
@@ -3428,12 +3449,6 @@ delete:
 
                if (path->slots[0] == 0 ||
                    path->slots[0] != pending_del_slot) {
-                       if (root->ref_cows &&
-                           BTRFS_I(inode)->location.objectid !=
-                                               BTRFS_FREE_INO_OBJECTID) {
-                               err = -EAGAIN;
-                               goto out;
-                       }
                        if (pending_del_nr) {
                                ret = btrfs_del_items(trans, root, path,
                                                pending_del_slot,
@@ -3576,6 +3591,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        u64 mask = root->sectorsize - 1;
        u64 hole_start = (oldsize + mask) & ~mask;
        u64 block_end = (size + mask) & ~mask;
@@ -3612,6 +3628,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask;
                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+                       struct extent_map *hole_em;
                        u64 hint_byte = 0;
                        hole_size = last_byte - cur_offset;
 
@@ -3621,7 +3638,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                break;
                        }
 
-                       err = btrfs_drop_extents(trans, inode, cur_offset,
+                       err = btrfs_drop_extents(trans, root, inode,
+                                                cur_offset,
                                                 cur_offset + hole_size,
                                                 &hint_byte, 1);
                        if (err) {
@@ -3640,9 +3658,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                break;
                        }
 
-                       btrfs_drop_extent_cache(inode, hole_start,
-                                       last_byte - 1, 0);
+                       btrfs_drop_extent_cache(inode, cur_offset,
+                                               cur_offset + hole_size - 1, 0);
+                       hole_em = alloc_extent_map();
+                       if (!hole_em) {
+                               set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                                       &BTRFS_I(inode)->runtime_flags);
+                               goto next;
+                       }
+                       hole_em->start = cur_offset;
+                       hole_em->len = hole_size;
+                       hole_em->orig_start = cur_offset;
+
+                       hole_em->block_start = EXTENT_MAP_HOLE;
+                       hole_em->block_len = 0;
+                       hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+                       hole_em->compress_type = BTRFS_COMPRESS_NONE;
+                       hole_em->generation = trans->transid;
 
+                       while (1) {
+                               write_lock(&em_tree->lock);
+                               err = add_extent_mapping(em_tree, hole_em);
+                               if (!err)
+                                       list_move(&hole_em->list,
+                                                 &em_tree->modified_extents);
+                               write_unlock(&em_tree->lock);
+                               if (err != -EEXIST)
+                                       break;
+                               btrfs_drop_extent_cache(inode, cur_offset,
+                                                       cur_offset +
+                                                       hole_size - 1, 0);
+                       }
+                       free_extent_map(hole_em);
+next:
                        btrfs_update_inode(trans, root, inode);
                        btrfs_end_transaction(trans, root);
                }
@@ -3773,6 +3821,7 @@ void btrfs_evict_inode(struct inode *inode)
                goto no_delete;
        }
        rsv->size = min_size;
+       rsv->failfast = 1;
        global_rsv = &root->fs_info->global_block_rsv;
 
        btrfs_i_size_write(inode, 0);
@@ -3817,7 +3866,7 @@ void btrfs_evict_inode(struct inode *inode)
                trans->block_rsv = rsv;
 
                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-               if (ret != -EAGAIN)
+               if (ret != -ENOSPC)
                        break;
 
                nr = trans->blocks_used;
@@ -4670,6 +4719,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        BTRFS_I(inode)->generation = trans->transid;
        inode->i_generation = BTRFS_I(inode)->generation;
 
+       /*
+        * We could have gotten an inode number from somebody who was fsynced
+        * and then removed in this same transaction, so let's just set full
+        * sync since it will be a full sync anyway and this will blow away the
+        * old info in the log.
+        */
+       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
        if (S_ISDIR(mode))
                owner = 0;
        else
@@ -5774,18 +5831,112 @@ out:
        return ret;
 }
 
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+                             struct extent_state **cached_state, int writing)
+{
+       struct btrfs_ordered_extent *ordered;
+       int ret = 0;
+
+       while (1) {
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                0, cached_state);
+               /*
+                * We're concerned with the entire range that we're going to be
+                * doing DIO to, so we need to make sure theres no ordered
+                * extents in this range.
+                */
+               ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                    lockend - lockstart + 1);
+
+               /*
+                * We need to make sure there are no buffered pages in this
+                * range either, we could have raced between the invalidate in
+                * generic_file_direct_write and locking the extent.  The
+                * invalidate needs to happen so that reads after a write do not
+                * get stale data.
+                */
+               if (!ordered && (!writing ||
+                   !test_range_bit(&BTRFS_I(inode)->io_tree,
+                                   lockstart, lockend, EXTENT_UPTODATE, 0,
+                                   *cached_state)))
+                       break;
+
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    cached_state, GFP_NOFS);
+
+               if (ordered) {
+                       btrfs_start_ordered_extent(inode, ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+               } else {
+                       /* Screw you mmap */
+                       ret = filemap_write_and_wait_range(inode->i_mapping,
+                                                          lockstart,
+                                                          lockend);
+                       if (ret)
+                               break;
+
+                       /*
+                        * If we found a page that couldn't be invalidated just
+                        * fall back to buffered.
+                        */
+                       ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                       lockstart >> PAGE_CACHE_SHIFT,
+                                       lockend >> PAGE_CACHE_SHIFT);
+                       if (ret)
+                               break;
+               }
+
+               cond_resched();
+       }
+
+       return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_state *cached_state = NULL;
        u64 start = iblock << inode->i_blkbits;
+       u64 lockstart, lockend;
        u64 len = bh_result->b_size;
        struct btrfs_trans_handle *trans;
+       int unlock_bits = EXTENT_LOCKED;
+       int ret;
+
+       if (create) {
+               ret = btrfs_delalloc_reserve_space(inode, len);
+               if (ret)
+                       return ret;
+               unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+       } else {
+               len = min_t(u64, len, root->sectorsize);
+       }
+
+       lockstart = start;
+       lockend = start + len - 1;
+
+       /*
+        * If this errors out it's because we couldn't invalidate pagecache for
+        * this range and we need to fallback to buffered.
+        */
+       if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+               return -ENOTBLK;
+
+       if (create) {
+               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, EXTENT_DELALLOC, NULL,
+                                    &cached_state, GFP_NOFS);
+               if (ret)
+                       goto unlock_err;
+       }
 
        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-       if (IS_ERR(em))
-               return PTR_ERR(em);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto unlock_err;
+       }
 
        /*
         * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5804,17 +5955,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
            em->block_start == EXTENT_MAP_INLINE) {
                free_extent_map(em);
-               return -ENOTBLK;
+               ret = -ENOTBLK;
+               goto unlock_err;
        }
 
        /* Just a good old fashioned hole, return */
        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
                free_extent_map(em);
-               /* DIO will do one hole at a time, so just unlock a sector */
-               unlock_extent(&BTRFS_I(inode)->io_tree, start,
-                             start + root->sectorsize - 1);
-               return 0;
+               ret = 0;
+               goto unlock_err;
        }
 
        /*
@@ -5827,8 +5977,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         *
         */
        if (!create) {
-               len = em->len - (start - em->start);
-               goto map;
+               len = min(len, em->len - (start - em->start));
+               lockstart = start + len;
+               goto unlock;
        }
 
        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5860,7 +6011,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                        btrfs_end_transaction(trans, root);
                        if (ret) {
                                free_extent_map(em);
-                               return ret;
+                               goto unlock_err;
                        }
                        goto unlock;
                }
@@ -5873,14 +6024,12 @@ must_cow:
         */
        len = bh_result->b_size;
        em = btrfs_new_extent_direct(inode, em, start, len);
-       if (IS_ERR(em))
-               return PTR_ERR(em);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto unlock_err;
+       }
        len = min(len, em->len - (start - em->start));
 unlock:
-       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-                         EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
-                         0, NULL, GFP_NOFS);
-map:
        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
                inode->i_blkbits;
        bh_result->b_size = len;
@@ -5898,9 +6047,44 @@ map:
                        i_size_write(inode, start + len);
        }
 
+       /*
+        * In the case of write we need to clear and unlock the entire range,
+        * in the case of read we need to unlock only the end area that we
+        * aren't using if there is any left over space.
+        */
+       if (lockstart < lockend) {
+               if (create && len < lockend - lockstart) {
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                        lockstart + len - 1, unlock_bits, 1, 0,
+                                        &cached_state, GFP_NOFS);
+                       /*
+                        * Beside unlock, we also need to cleanup reserved space
+                        * for the left range by attaching EXTENT_DO_ACCOUNTING.
+                        */
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree,
+                                        lockstart + len, lockend,
+                                        unlock_bits | EXTENT_DO_ACCOUNTING,
+                                        1, 0, NULL, GFP_NOFS);
+               } else {
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                        lockend, unlock_bits, 1, 0,
+                                        &cached_state, GFP_NOFS);
+               }
+       } else {
+               free_extent_state(cached_state);
+       }
+
        free_extent_map(em);
 
        return 0;
+
+unlock_err:
+       if (create)
+               unlock_bits |= EXTENT_DO_ACCOUNTING;
+
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                        unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+       return ret;
 }
 
 struct btrfs_dio_private {
@@ -5908,7 +6092,6 @@ struct btrfs_dio_private {
        u64 logical_offset;
        u64 disk_bytenr;
        u64 bytes;
-       u32 *csums;
        void *private;
 
        /* number of bios pending for this dio */
@@ -5928,7 +6111,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 start;
-       u32 *private = dip->csums;
 
        start = dip->logical_offset;
        do {
@@ -5936,8 +6118,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
                        struct page *page = bvec->bv_page;
                        char *kaddr;
                        u32 csum = ~(u32)0;
+                       u64 private = ~(u32)0;
                        unsigned long flags;
 
+                       if (get_state_private(&BTRFS_I(inode)->io_tree,
+                                             start, &private))
+                               goto failed;
                        local_irq_save(flags);
                        kaddr = kmap_atomic(page);
                        csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
@@ -5947,18 +6133,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
                        local_irq_restore(flags);
 
                        flush_dcache_page(bvec->bv_page);
-                       if (csum != *private) {
+                       if (csum != private) {
+failed:
                                printk(KERN_ERR "btrfs csum failed ino %llu off"
                                      " %llu csum %u private %u\n",
                                      (unsigned long long)btrfs_ino(inode),
                                      (unsigned long long)start,
-                                     csum, *private);
+                                     csum, (unsigned)private);
                                err = -EIO;
                        }
                }
 
                start += bvec->bv_len;
-               private++;
                bvec++;
        } while (bvec <= bvec_end);
 
@@ -5966,7 +6152,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
                      dip->logical_offset + dip->bytes - 1);
        bio->bi_private = dip->private;
 
-       kfree(dip->csums);
        kfree(dip);
 
        /* If we had a csum failure make sure to clear the uptodate flag */
@@ -6072,7 +6257,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
 
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                                         int rw, u64 file_offset, int skip_sum,
-                                        u32 *csums, int async_submit)
+                                        int async_submit)
 {
        int write = rw & REQ_WRITE;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6105,8 +6290,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                if (ret)
                        goto err;
        } else if (!skip_sum) {
-               ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
-                                         file_offset, csums);
+               ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
                if (ret)
                        goto err;
        }
@@ -6132,10 +6316,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        u64 submit_len = 0;
        u64 map_length;
        int nr_pages = 0;
-       u32 *csums = dip->csums;
        int ret = 0;
        int async_submit = 0;
-       int write = rw & REQ_WRITE;
 
        map_length = orig_bio->bi_size;
        ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -6171,16 +6353,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                        atomic_inc(&dip->pending_bios);
                        ret = __btrfs_submit_dio_bio(bio, inode, rw,
                                                     file_offset, skip_sum,
-                                                    csums, async_submit);
+                                                    async_submit);
                        if (ret) {
                                bio_put(bio);
                                atomic_dec(&dip->pending_bios);
                                goto out_err;
                        }
 
-                       /* Write's use the ordered csums */
-                       if (!write && !skip_sum)
-                               csums = csums + nr_pages;
                        start_sector += submit_len >> 9;
                        file_offset += submit_len;
 
@@ -6210,7 +6389,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
 submit:
        ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
-                                    csums, async_submit);
+                                    async_submit);
        if (!ret)
                return 0;
 
@@ -6246,17 +6425,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
                ret = -ENOMEM;
                goto free_ordered;
        }
-       dip->csums = NULL;
-
-       /* Write's use the ordered csum stuff, so we don't need dip->csums */
-       if (!write && !skip_sum) {
-               dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
-               if (!dip->csums) {
-                       kfree(dip);
-                       ret = -ENOMEM;
-                       goto free_ordered;
-               }
-       }
 
        dip->private = bio->bi_private;
        dip->inode = inode;
@@ -6341,132 +6509,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
 out:
        return retval;
 }
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
                        unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-       struct btrfs_ordered_extent *ordered;
-       struct extent_state *cached_state = NULL;
-       u64 lockstart, lockend;
-       ssize_t ret;
-       int writing = rw & WRITE;
-       int write_bits = 0;
-       size_t count = iov_length(iov, nr_segs);
 
        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-                           offset, nr_segs)) {
+                           offset, nr_segs))
                return 0;
-       }
-
-       lockstart = offset;
-       lockend = offset + count - 1;
-
-       if (writing) {
-               ret = btrfs_delalloc_reserve_space(inode, count);
-               if (ret)
-                       goto out;
-       }
-
-       while (1) {
-               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                0, &cached_state);
-               /*
-                * We're concerned with the entire range that we're going to be
-                * doing DIO to, so we need to make sure theres no ordered
-                * extents in this range.
-                */
-               ordered = btrfs_lookup_ordered_range(inode, lockstart,
-                                                    lockend - lockstart + 1);
-
-               /*
-                * We need to make sure there are no buffered pages in this
-                * range either, we could have raced between the invalidate in
-                * generic_file_direct_write and locking the extent.  The
-                * invalidate needs to happen so that reads after a write do not
-                * get stale data.
-                */
-               if (!ordered && (!writing ||
-                   !test_range_bit(&BTRFS_I(inode)->io_tree,
-                                   lockstart, lockend, EXTENT_UPTODATE, 0,
-                                   cached_state)))
-                       break;
-
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                    &cached_state, GFP_NOFS);
-
-               if (ordered) {
-                       btrfs_start_ordered_extent(inode, ordered, 1);
-                       btrfs_put_ordered_extent(ordered);
-               } else {
-                       /* Screw you mmap */
-                       ret = filemap_write_and_wait_range(file->f_mapping,
-                                                          lockstart,
-                                                          lockend);
-                       if (ret)
-                               goto out;
-
-                       /*
-                        * If we found a page that couldn't be invalidated just
-                        * fall back to buffered.
-                        */
-                       ret = invalidate_inode_pages2_range(file->f_mapping,
-                                       lockstart >> PAGE_CACHE_SHIFT,
-                                       lockend >> PAGE_CACHE_SHIFT);
-                       if (ret) {
-                               if (ret == -EBUSY)
-                                       ret = 0;
-                               goto out;
-                       }
-               }
-
-               cond_resched();
-       }
 
-       /*
-        * we don't use btrfs_set_extent_delalloc because we don't want
-        * the dirty or uptodate bits
-        */
-       if (writing) {
-               write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
-               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                    EXTENT_DELALLOC, NULL, &cached_state,
-                                    GFP_NOFS);
-               if (ret) {
-                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                        lockend, EXTENT_LOCKED | write_bits,
-                                        1, 0, &cached_state, GFP_NOFS);
-                       goto out;
-               }
-       }
-
-       free_extent_state(cached_state);
-       cached_state = NULL;
-
-       ret = __blockdev_direct_IO(rw, iocb, inode,
+       return __blockdev_direct_IO(rw, iocb, inode,
                   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
                   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
                   btrfs_submit_direct, 0);
-
-       if (ret < 0 && ret != -EIOCBQUEUED) {
-               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-                             offset + iov_length(iov, nr_segs) - 1,
-                             EXTENT_LOCKED | write_bits, 1, 0,
-                             &cached_state, GFP_NOFS);
-       } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
-               /*
-                * We're falling back to buffered, unlock the section we didn't
-                * do IO on.
-                */
-               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
-                             offset + iov_length(iov, nr_segs) - 1,
-                             EXTENT_LOCKED | write_bits, 1, 0,
-                             &cached_state, GFP_NOFS);
-       }
-out:
-       free_extent_state(cached_state);
-       return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -6716,6 +6774,7 @@ again:
 
        BTRFS_I(inode)->last_trans = root->fs_info->generation;
        BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+       BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
@@ -6790,6 +6849,7 @@ static int btrfs_truncate(struct inode *inode)
        if (!rsv)
                return -ENOMEM;
        rsv->size = min_size;
+       rsv->failfast = 1;
 
        /*
         * 1 for the truncate slack space
@@ -6835,36 +6895,21 @@ static int btrfs_truncate(struct inode *inode)
                                           &BTRFS_I(inode)->runtime_flags))
                btrfs_add_ordered_operation(trans, root, inode);
 
-       while (1) {
-               ret = btrfs_block_rsv_refill(root, rsv, min_size);
-               if (ret) {
-                       /*
-                        * This can only happen with the original transaction we
-                        * started above, every other time we shouldn't have a
-                        * transaction started yet.
-                        */
-                       if (ret == -EAGAIN)
-                               goto end_trans;
-                       err = ret;
-                       break;
-               }
-
-               if (!trans) {
-                       /* Just need the 1 for updating the inode */
-                       trans = btrfs_start_transaction(root, 1);
-                       if (IS_ERR(trans)) {
-                               ret = err = PTR_ERR(trans);
-                               trans = NULL;
-                               break;
-                       }
-               }
-
-               trans->block_rsv = rsv;
+       /*
+        * So if we truncate and then write and fsync we normally would just
+        * write the extents that changed, which is a problem if we need to
+        * first truncate that entire inode.  So set this flag so we write out
+        * all of the extents in the inode to the sync log so we're completely
+        * safe.
+        */
+       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+       trans->block_rsv = rsv;
 
+       while (1) {
                ret = btrfs_truncate_inode_items(trans, root, inode,
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
-               if (ret != -EAGAIN) {
+               if (ret != -ENOSPC) {
                        err = ret;
                        break;
                }
@@ -6875,11 +6920,22 @@ static int btrfs_truncate(struct inode *inode)
                        err = ret;
                        break;
                }
-end_trans:
+
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
-               trans = NULL;
                btrfs_btree_balance_dirty(root, nr);
+
+               trans = btrfs_start_transaction(root, 2);
+               if (IS_ERR(trans)) {
+                       ret = err = PTR_ERR(trans);
+                       trans = NULL;
+                       break;
+               }
+
+               ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+                                             rsv, min_size);
+               BUG_ON(ret);    /* shouldn't happen */
+               trans->block_rsv = rsv;
        }
 
        if (ret == 0 && inode->i_nlink > 0) {
@@ -6963,6 +7019,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->csum_bytes = 0;
        ei->index_cnt = (u64)-1;
        ei->last_unlink_trans = 0;
+       ei->last_log_commit = 0;
 
        spin_lock_init(&ei->lock);
        ei->outstanding_extents = 0;
@@ -7506,6 +7563,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                       loff_t actual_len, u64 *alloc_hint,
                                       struct btrfs_trans_handle *trans)
 {
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
@@ -7546,6 +7605,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                btrfs_drop_extent_cache(inode, cur_offset,
                                        cur_offset + ins.offset -1, 0);
 
+               em = alloc_extent_map();
+               if (!em) {
+                       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                               &BTRFS_I(inode)->runtime_flags);
+                       goto next;
+               }
+
+               em->start = cur_offset;
+               em->orig_start = cur_offset;
+               em->len = ins.offset;
+               em->block_start = ins.objectid;
+               em->block_len = ins.offset;
+               em->bdev = root->fs_info->fs_devices->latest_bdev;
+               set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+               em->generation = trans->transid;
+
+               while (1) {
+                       write_lock(&em_tree->lock);
+                       ret = add_extent_mapping(em_tree, em);
+                       if (!ret)
+                               list_move(&em->list,
+                                         &em_tree->modified_extents);
+                       write_unlock(&em_tree->lock);
+                       if (ret != -EEXIST)
+                               break;
+                       btrfs_drop_extent_cache(inode, cur_offset,
+                                               cur_offset + ins.offset - 1,
+                                               0);
+               }
+               free_extent_map(em);
+next:
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
                *alloc_hint = ins.objectid + ins.offset;