btrfs: use only inline_pages from extent buffer
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a..0306665 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
 {
        struct rb_node *node;
 
-       if (end < start) {
-               printk(KERN_ERR "btrfs end < start %llu %llu\n",
+       if (end < start)
+               WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
                       (unsigned long long)end,
                       (unsigned long long)start);
-               WARN_ON(1);
-       }
        state->start = start;
        state->end = end;
 
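The hunk above folds the open-coded printk() + WARN_ON(1) pair into a single WARN() call. WARN(cond, fmt, ...) prints the formatted message together with the usual stack dump when cond is true, and it evaluates to cond, so the same macro can also guard an error path. A minimal sketch of that idiom (the early return is hypothetical, not something this patch adds):

	if (WARN(end < start, "btrfs end < start %llu %llu\n",
		 (unsigned long long)end, (unsigned long long)start))
		return -EINVAL;	/* hypothetical bail-out */
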
@@ -1836,7 +1834,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  */
 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 {
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
                SetPageUptodate(page);
@@ -1848,7 +1846,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
  */
 static void check_page_locked(struct extent_io_tree *tree, struct page *page)
 {
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
                unlock_page(page);
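check_page_uptodate() and check_page_locked(), like several later hunks, switch from the open-coded `(u64)page->index << PAGE_CACHE_SHIFT` to page_offset(). The helper lives in include/linux/pagemap.h and, in kernels of this vintage, expands to the same computation:

	/* include/linux/pagemap.h: byte offset of a page in its mapping */
	static inline loff_t page_offset(struct page *page)
	{
		return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
	}
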
@@ -1897,13 +1895,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
        if (ret)
                err = ret;
 
-       if (did_repair) {
-               ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
-                                       rec->start + rec->len - 1,
-                                       EXTENT_DAMAGED, GFP_NOFS);
-               if (ret && !err)
-                       err = ret;
-       }
+       ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+                               rec->start + rec->len - 1,
+                               EXTENT_DAMAGED, GFP_NOFS);
+       if (ret && !err)
+               err = ret;
 
        kfree(rec);
        return err;
@@ -1919,12 +1915,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
  * the standard behavior is to write all copies in a raid setup. here we only
  * want to write the one bad copy. so we do the mapping for ourselves and issue
  * submit_bio directly.
- * to avoid any synchonization issues, wait for the data after writing, which
+ * to avoid any synchronization issues, wait for the data after writing, which
  * actually prevents the read that triggered the error from finishing.
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
                        u64 length, u64 logical, struct page *page,
                        int mirror_num)
 {
@@ -1934,10 +1930,15 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        u64 map_length = 0;
        u64 sector;
        struct btrfs_bio *bbio = NULL;
+       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        int ret;
 
        BUG_ON(!mirror_num);
 
+       /* we can't repair anything in raid56 yet */
+       if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+               return 0;
+
        bio = bio_alloc(GFP_NOFS, 1);
        if (!bio)
                return -EIO;
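The new early return reflects that RAID5/6 redundancy is parity rather than a second full copy: a damaged block cannot be repaired by simply rewriting one page over one mirror, because the "extra mirror" would have to be reconstructed from the parity stripe. btrfs_is_parity_mirror(), added by the raid56 series this hunk depends on, detects that case and repair quietly becomes a no-op.
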
@@ -1946,7 +1947,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        bio->bi_size = 0;
        map_length = length;
 
-       ret = btrfs_map_block(map_tree, WRITE, logical,
+       ret = btrfs_map_block(fs_info, WRITE, logical,
                              &map_length, &bbio, mirror_num);
        if (ret) {
                bio_put(bio);
@@ -1962,7 +1963,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
                return -EIO;
        }
        bio->bi_bdev = dev->bdev;
-       bio_add_page(bio, page, length, start-page_offset(page));
+       bio_add_page(bio, page, length, start - page_offset(page));
        btrfsic_submit_bio(WRITE_SYNC, bio);
        wait_for_completion(&compl);
 
@@ -1984,14 +1985,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
                         int mirror_num)
 {
-       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
        u64 start = eb->start;
        unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
        int ret = 0;
 
        for (i = 0; i < num_pages; i++) {
                struct page *p = extent_buffer_page(eb, i);
-               ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+               ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
                                        start, p, mirror_num);
                if (ret)
                        break;
@@ -2010,7 +2010,7 @@ static int clean_io_failure(u64 start, struct page *page)
        u64 private;
        u64 private_failure;
        struct io_failure_record *failrec;
-       struct btrfs_mapping_tree *map_tree;
+       struct btrfs_fs_info *fs_info;
        struct extent_state *state;
        int num_copies;
        int did_repair = 0;
@@ -2046,15 +2046,16 @@ static int clean_io_failure(u64 start, struct page *page)
        spin_unlock(&BTRFS_I(inode)->io_tree.lock);
 
        if (state && state->start == failrec->start) {
-               map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-               num_copies = btrfs_num_copies(map_tree, failrec->logical,
-                                               failrec->len);
+               fs_info = BTRFS_I(inode)->root->fs_info;
+               num_copies = btrfs_num_copies(fs_info, failrec->logical,
+                                             failrec->len);
                if (num_copies > 1)  {
-                       ret = repair_io_failure(map_tree, start, failrec->len,
+                       ret = repair_io_failure(fs_info, start, failrec->len,
                                                failrec->logical, page,
                                                failrec->failed_mirror);
                        did_repair = !ret;
                }
+               ret = 0;
        }
 
 out:
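Note the added `ret = 0;`: even when repair_io_failure() fails, clean_io_failure() now reports success. The read being cleaned up was already satisfied from a good mirror, so a failed background rewrite should not fail it retroactively.
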
@@ -2159,9 +2160,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
                 * clean_io_failure() clean all those errors at once.
                 */
        }
-       num_copies = btrfs_num_copies(
-                             &BTRFS_I(inode)->root->fs_info->mapping_tree,
-                             failrec->logical, failrec->len);
+       num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                     failrec->logical, failrec->len);
        if (num_copies == 1) {
                /*
                 * we only have a single copy of the data, so don't bother with
@@ -2297,8 +2297,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
                struct page *page = bvec->bv_page;
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-               start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-                        bvec->bv_offset;
+               start = page_offset(page) + bvec->bv_offset;
                end = start + bvec->bv_len - 1;
 
                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2357,8 +2356,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                         (long int)bio->bi_bdev);
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-               start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-                       bvec->bv_offset;
+               start = page_offset(page) + bvec->bv_offset;
                end = start + bvec->bv_len - 1;
 
                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2466,10 +2464,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
        return bio;
 }
 
-/*
- * Since writes are async, they will only return -ENOMEM.
- * Reads can return the full range of I/O error conditions.
- */
 static int __must_check submit_one_bio(int rw, struct bio *bio,
                                       int mirror_num, unsigned long bio_flags)
 {
@@ -2479,7 +2473,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
        struct extent_io_tree *tree = bio->bi_private;
        u64 start;
 
-       start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+       start = page_offset(page) + bvec->bv_offset;
 
        bio->bi_private = NULL;
 
@@ -2497,13 +2491,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
        return ret;
 }
 
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
                     unsigned long offset, size_t size, struct bio *bio,
                     unsigned long bio_flags)
 {
        int ret = 0;
        if (tree->ops && tree->ops->merge_bio_hook)
-               ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+               ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
                                                bio_flags);
        BUG_ON(ret < 0);
        return ret;
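The extra rw argument lets a merge_bio_hook implementation distinguish reads from writes when deciding whether another page may be added to the bio under construction; presumably this is preparation for raid56, where write bios must respect stripe boundaries that reads need not care about.
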
@@ -2538,7 +2532,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                                sector;
 
                if (prev_bio_flags != bio_flags || !contig ||
-                   merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
+                   merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
                    bio_add_page(bio, page, page_size, offset) < page_size) {
                        ret = submit_one_bio(rw, bio, mirror_num,
                                             prev_bio_flags);
@@ -2603,7 +2597,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                                   unsigned long *bio_flags)
 {
        struct inode *inode = page->mapping->host;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 page_end = start + PAGE_CACHE_SIZE - 1;
        u64 end;
        u64 cur = start;
@@ -2656,6 +2650,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                }
        }
        while (cur <= end) {
+               unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
                if (cur >= last_byte) {
                        char *userpage;
                        struct extent_state *cached = NULL;
@@ -2690,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                cur_end = min(extent_map_end(em) - 1, end);
-               iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+               iosize = ALIGN(iosize, blocksize);
                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
                        disk_io_size = em->block_len;
                        sector = em->block_start >> 9;
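Replacing the mask arithmetic with ALIGN() is purely cosmetic here: a block size is always a power of two, and for power-of-two alignments the macro from include/linux/kernel.h reduces to exactly the expression it replaces. Simplified:

	/* include/linux/kernel.h, simplified: round x up to the next
	 * multiple of the power-of-two value a */
	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((typeof(x))(a) - 1))

The same substitution recurs below in __extent_writepage(), extent_invalidatepage() and get_extent_skip_holes().
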
@@ -2743,26 +2739,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        continue;
                }
 
-               ret = 0;
-               if (tree->ops && tree->ops->readpage_io_hook) {
-                       ret = tree->ops->readpage_io_hook(page, cur,
-                                                         cur + iosize - 1);
-               }
-               if (!ret) {
-                       unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
-                       pnr -= page->index;
-                       ret = submit_extent_page(READ, tree, page,
+               pnr -= page->index;
+               ret = submit_extent_page(READ, tree, page,
                                         sector, disk_io_size, pg_offset,
                                         bdev, bio, pnr,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
                                         this_bio_flag);
-                       if (!ret) {
-                               nr++;
-                               *bio_flags = this_bio_flag;
-                       }
-               }
-               if (ret) {
+               if (!ret) {
+                       nr++;
+                       *bio_flags = this_bio_flag;
+               } else {
                        SetPageError(page);
                        unlock_extent(tree, cur, cur + iosize - 1);
                }
@@ -2814,7 +2801,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        struct inode *inode = page->mapping->host;
        struct extent_page_data *epd = data;
        struct extent_io_tree *tree = epd->tree;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 delalloc_start;
        u64 page_end = start + PAGE_CACHE_SIZE - 1;
        u64 end;
@@ -2990,7 +2977,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
-               iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+               iosize = ALIGN(iosize, blocksize);
                sector = (em->block_start + extent_offset) >> 9;
                bdev = em->bdev;
                block_start = em->block_start;
@@ -3132,12 +3119,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
                spin_unlock(&eb->refs_lock);
                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-               spin_lock(&fs_info->delalloc_lock);
-               if (fs_info->dirty_metadata_bytes >= eb->len)
-                       fs_info->dirty_metadata_bytes -= eb->len;
-               else
-                       WARN_ON(1);
-               spin_unlock(&fs_info->delalloc_lock);
+               __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+                                    -eb->len,
+                                    fs_info->dirty_metadata_batch);
                ret = 1;
        } else {
                spin_unlock(&eb->refs_lock);
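dirty_metadata_bytes stops being a plain u64 guarded by the global delalloc_lock and becomes a struct percpu_counter: __percpu_counter_add() accumulates the delta in a per-CPU slot and only touches the shared count (and its internal lock) once a CPU's local delta exceeds the given batch, so metadata writeback no longer serializes on one spinlock. A sketch of the setup this relies on, done by a companion patch in disk-io.c (the batch formula is quoted from memory and should be treated as illustrative):

	if (percpu_counter_init(&fs_info->dirty_metadata_bytes, 0))
		goto fail;	/* hypothetical error label */
	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
					(1 + ilog2(nr_cpu_ids));
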
@@ -3454,15 +3438,9 @@ retry:
                         * swizzled back from swapper_space to tmpfs file
                         * mapping
                         */
-                       if (tree->ops &&
-                           tree->ops->write_cache_pages_lock_hook) {
-                               tree->ops->write_cache_pages_lock_hook(page,
-                                                              data, flush_fn);
-                       } else {
-                               if (!trylock_page(page)) {
-                                       flush_fn(data);
-                                       lock_page(page);
-                               }
+                       if (!trylock_page(page)) {
+                               flush_fn(data);
+                               lock_page(page);
                        }
 
                        if (unlikely(page->mapping != mapping)) {
@@ -3682,11 +3660,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
                          struct page *page, unsigned long offset)
 {
        struct extent_state *cached_state = NULL;
-       u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
-       start += (offset + blocksize - 1) & ~(blocksize - 1);
+       start += ALIGN(offset, blocksize);
        if (start > end)
                return 0;
 
@@ -3708,7 +3686,7 @@ int try_release_extent_state(struct extent_map_tree *map,
                             struct extent_io_tree *tree, struct page *page,
                             gfp_t mask)
 {
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        int ret = 1;
 
@@ -3747,7 +3725,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                               gfp_t mask)
 {
        struct extent_map *em;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
 
        if ((mask & __GFP_WAIT) &&
@@ -3805,7 +3783,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
                len = last - offset;
                if (len == 0)
                        break;
-               len = (len + sectorsize - 1) & ~(sectorsize - 1);
+               len = ALIGN(len, sectorsize);
                em = get_extent(inode, NULL, 0, offset, len, 0);
                if (IS_ERR_OR_NULL(em))
                        return em;
@@ -4003,8 +3981,6 @@ static void __free_extent_buffer(struct extent_buffer *eb)
        list_del(&eb->leak_list);
        spin_unlock_irqrestore(&leak_lock, flags);
 #endif
-       if (eb->pages && eb->pages != eb->inline_pages)
-               kfree(eb->pages);
        kmem_cache_free(extent_buffer_cache, eb);
 }
 
@@ -4045,19 +4021,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        atomic_set(&eb->refs, 1);
        atomic_set(&eb->io_pages, 0);
 
-       if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
-               struct page **pages;
-               int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
-                       PAGE_CACHE_SHIFT;
-               pages = kzalloc(num_pages, mask);
-               if (!pages) {
-                       __free_extent_buffer(eb);
-                       return NULL;
-               }
-               eb->pages = pages;
-       } else {
-               eb->pages = eb->inline_pages;
-       }
+       /*
+        * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
+        */
+       BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+               > MAX_INLINE_EXTENT_BUFFER_SIZE);
+       BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
 
        return eb;
 }
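This is the change the page title refers to: an extent buffer now always uses the page array embedded in the struct, so the allocation can no longer fail here, and the matching kfree() disappears from __free_extent_buffer() above. The constants are defined along these lines in extent_io.h (quoted from memory, treat as illustrative):

	#define INLINE_EXTENT_BUFFER_PAGES	16
	#define MAX_INLINE_EXTENT_BUFFER_SIZE	(INLINE_EXTENT_BUFFER_PAGES * \
						 PAGE_CACHE_SIZE)

	struct extent_buffer {
		/* ... */
		struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
	};

BUILD_BUG_ON() fails the build if BTRFS_MAX_METADATA_BLOCKSIZE could ever outgrow the inline array; the runtime BUG_ON() catches callers passing a bad length.
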
@@ -4188,6 +4157,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
+       int refs;
        /* the ref bit is tricky.  We have to make sure it is set
         * if we have the buffer dirty.   Otherwise the
         * code to free a buffer can end up dropping a dirty
@@ -4208,6 +4178,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
         * So bump the ref count first, then set the bit.  If someone
         * beat us to it, drop the ref we added.
         */
+       refs = atomic_read(&eb->refs);
+       if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               return;
+
        spin_lock(&eb->refs_lock);
        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_inc(&eb->refs);
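The unlocked early return is a fast path for callers that already hold a reference: if they observe refs >= 2 with TREE_REF set, the tree itself still holds one of those references, and the bit is only cleared under refs_lock as the count drops, so re-checking under the lock would not change the answer. Only the cold path still takes the spinlock.
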
@@ -4409,9 +4383,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 
 void free_extent_buffer(struct extent_buffer *eb)
 {
+       int refs;
+       int old;
        if (!eb)
                return;
 
+       while (1) {
+               refs = atomic_read(&eb->refs);
+               if (refs <= 3)
+                       break;
+               old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+               if (old == refs)
+                       return;
+       }
+
        spin_lock(&eb->refs_lock);
        if (atomic_read(&eb->refs) == 2 &&
            test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
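free_extent_buffer() gets a similar lock-free fast path: while refs is above 3, even after dropping one reference the count stays above the values the locked section below tests for, so the cmpxchg loop simply retries until it decrements a value nobody else changed in the meantime and returns without taking refs_lock. Only when the count could fall low enough to matter for the special cases (such as the DUMMY buffer held at 2 refs) does it fall through to the locked slow path.
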
@@ -4721,10 +4706,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
        }
 
        if (start + min_len > eb->len) {
-               printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+               WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
                       "wanted %lu %lu\n", (unsigned long long)eb->start,
                       eb->len, start, min_len);
-               WARN_ON(1);
                return -EINVAL;
        }