Btrfs: fix task hang under heavy compressed write
[cascardo/linux.git] fs/btrfs/inode.c
index 3668048..3d020d6 100644
@@ -709,6 +709,18 @@ retry:
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);
+
+                               /*
+                                * we need to redirty the pages if we decide to
+                                * fall back to uncompressed IO; otherwise we
+                                * will not submit these pages down to lower
+                                * layers.
+                                */
+                               extent_range_redirty_for_io(inode,
+                                               async_extent->start,
+                                               async_extent->start +
+                                               async_extent->ram_size - 1);
+
                                goto retry;
                        }
                        goto out_free;
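
The redirty call added above is the heart of the compressed-write fallback: by the time we give up on compression for this range, the pages were already cleaned when they were handed to the compression path, so without redirtying nothing would ever resubmit them as plain IO. Conceptually the helper only has to walk the page range and mark each page dirty again; a minimal sketch of that idea follows (an illustration under the assumption that the pages are still resident in the page cache, not the actual fs/btrfs/extent_io.c implementation):

/*
 * Illustration only: make every page in [start, end] dirty again so that
 * a later writepages pass resubmits it as uncompressed IO.
 */
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void redirty_range_for_io_sketch(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		/* Pages are expected to still be present (assumption). */
		page = find_get_page(inode->i_mapping, index);
		if (page) {
			__set_page_dirty_nobuffers(page);
			page_cache_release(page);
		}
		index++;
	}
}

With the range dirty again, the goto retry can send this extent down the regular uncompressed path.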
@@ -1084,8 +1096,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);
 
-               btrfs_init_work(&async_cow->work, async_cow_start,
-                               async_cow_submit, async_cow_free);
+               btrfs_init_work(&async_cow->work,
+                               btrfs_delalloc_helper,
+                               async_cow_start, async_cow_submit,
+                               async_cow_free);
 
                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
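
The extra first argument to btrfs_init_work() is what the commit subject is about: every kind of btrfs work now registers its own trivial entry function (btrfs_delalloc_helper here; btrfs_fixup_helper, btrfs_endio_write_helper, btrfs_freespace_write_helper and btrfs_flush_delalloc_helper in the hunks below). Roughly, the workqueue core considers a newly queued item to be the same as one a worker is already running when both the work pointer and its function match; btrfs work structs are freed and reallocated constantly, so funnelling every type through one shared entry function made such false matches possible and could park an unrelated item behind a long-running one, hanging tasks that wait on it. The helpers themselves live in the companion fs/btrfs/async-thread change, not in this file; the sketch below shows the assumed shape of that pattern (only the helper identifiers come from these hunks, the names normal_work and normal_work_helper() and the btrfs_init_work() body are assumptions for illustration):

/* Sketch of the per-type helper pattern assumed to live in async-thread.[ch]. */
#define BTRFS_WORK_HELPER(name)						\
void btrfs_##name(struct work_struct *arg)				\
{									\
	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
					       normal_work);		\
	normal_work_helper(work);	/* common dispatch logic */	\
}

BTRFS_WORK_HELPER(delalloc_helper);
BTRFS_WORK_HELPER(endio_write_helper);
/* ...one per work type... */

void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
		     btrfs_func_t func, btrfs_func_t ordered_func,
		     btrfs_func_t ordered_free)
{
	work->func = func;
	work->ordered_func = ordered_func;
	work->ordered_free = ordered_free;
	/* The per-type entry point is what the workqueue core sees. */
	INIT_WORK(&work->normal_work, uniq_func);
}

The btrfs_writepage_end_io_hook() hunk below follows the same recipe: pick the queue and its matching helper together, then hand the helper to btrfs_init_work().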
@@ -1869,7 +1883,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 
        SetPageChecked(page);
        page_cache_get(page);
-       btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+       btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+                       btrfs_writepage_fixup_worker, NULL, NULL);
        fixup->page = page;
        btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
        return -EBUSY;
@@ -2810,7 +2825,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
        struct inode *inode = page->mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_extent *ordered_extent = NULL;
-       struct btrfs_workqueue *workers;
+       struct btrfs_workqueue *wq;
+       btrfs_work_func_t func;
 
        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
 
@@ -2819,13 +2835,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                            end - start + 1, uptodate))
                return 0;
 
-       btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+       if (btrfs_is_free_space_inode(inode)) {
+               wq = root->fs_info->endio_freespace_worker;
+               func = btrfs_freespace_write_helper;
+       } else {
+               wq = root->fs_info->endio_write_workers;
+               func = btrfs_endio_write_helper;
+       }
 
-       if (btrfs_is_free_space_inode(inode))
-               workers = root->fs_info->endio_freespace_worker;
-       else
-               workers = root->fs_info->endio_write_workers;
-       btrfs_queue_work(workers, &ordered_extent->work);
+       btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+                       NULL);
+       btrfs_queue_work(wq, &ordered_extent->work);
 
        return 0;
 }
@@ -4662,6 +4682,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
                remove_extent_mapping(map_tree, em);
                free_extent_map(em);
+               if (need_resched()) {
+                       write_unlock(&map_tree->lock);
+                       cond_resched();
+                       write_lock(&map_tree->lock);
+               }
        }
        write_unlock(&map_tree->lock);
 
@@ -4684,6 +4709,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
                                 &cached_state, GFP_NOFS);
                free_extent_state(state);
 
+               cond_resched();
                spin_lock(&io_tree->lock);
        }
        spin_unlock(&io_tree->lock);
@@ -5169,6 +5195,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                        iput(inode);
                        inode = ERR_PTR(ret);
                }
+               /*
+                * If orphan cleanup did remove any orphans, it means the tree
+                * was modified and therefore the commit root is not the same as
+                * the current root anymore. This is a problem, because send
+                * uses the commit root and therefore can see inode items that
+                * don't exist in the current root anymore, and for example make
+                * calls to btrfs_iget, which will do tree lookups based on the
+                * current root and not on the commit root. Those lookups will
+                * fail, returning a -ESTALE error, and making send fail with
+                * that error. So make sure a send does not see any orphans we
+                * have just removed, and that it will see the same inodes
+                * regardless of whether a transaction commit happened before
+                * it started (meaning that the commit root will be the same as
+                * the current root) or not.
+                */
+               if (sub_root->node != sub_root->commit_root) {
+                       u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
+
+                       if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
+                               struct extent_buffer *eb;
+
+                               /*
+                                * Assert we can't have races between dentry
+                                * lookup called through the snapshot creation
+                                * ioctl and the VFS.
+                                */
+                               ASSERT(mutex_is_locked(&dir->i_mutex));
+
+                               down_write(&root->fs_info->commit_root_sem);
+                               eb = sub_root->commit_root;
+                               sub_root->commit_root =
+                                       btrfs_root_node(sub_root);
+                               up_write(&root->fs_info->commit_root_sem);
+                               free_extent_buffer(eb);
+                       }
+               }
        }
 
        return inode;
@@ -5593,6 +5655,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                return ERR_PTR(-ENOMEM);
        }
 
+       /*
+        * O_TMPFILE: set the link count to 0, so that from this point on
+        * the inode item is filled in with the correct link count.
+        */
+       if (!name)
+               set_nlink(inode, 0);
+
        /*
         * we have to initialize this early, so we can reclaim the inode
         * number if we fail afterwards in this function.
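
The !name case is the O_TMPFILE path: the inode is created without any directory entry, so its link count must start at 0, and btrfs_tmpfile() further down only bumps it to 1 briefly for d_tmpfile(). For context, a userspace sketch of the flow this serves (the mount point is made up, error handling omitted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	/* Unnamed inode: it exists on /mnt/btrfs but has nlink == 0. */
	int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);

	write(fd, "scratch data", 12);

	/*
	 * Optionally give it a name later; only then does the link count
	 * become 1.  Just closing fd instead releases the inode.
	 */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	linkat(AT_FDCWD, path, AT_FDCWD, "/mnt/btrfs/now-visible",
	       AT_SYMLINK_FOLLOW);

	close(fd);
	return 0;
}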
@@ -6085,14 +6154,14 @@ out_fail:
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                struct extent_map *existing,
                                struct extent_map *em,
-                               u64 map_start, u64 map_len)
+                               u64 map_start)
 {
        u64 start_diff;
 
        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
        start_diff = map_start - em->start;
        em->start = map_start;
-       em->len = map_len;
+       em->len = existing->start - em->start;
        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
            !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                em->block_start += start_diff;
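
merge_extent_mapping() no longer takes a caller-chosen length (the call site further down drops root->sectorsize, which had forced the merged mapping to a single sector); instead the mapping is trimmed so it ends exactly where the mapping already present in the tree begins. A standalone worked example of the new arithmetic, with invented offsets and assuming the existing mapping sits in front of us:

/* Standalone illustration of the clamping above; the numbers are made up. */
#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	u64 existing_start = 40960;	/* mapping already in the tree      */
	u64 em_start       = 0;		/* new mapping covered [0, 65536)   */
	u64 map_start      = 8192;	/* offset the lookup asked about    */

	u64 start_diff = map_start - em_start;		/*  8192 */
	u64 new_start  = map_start;			/*  8192 */
	u64 new_len    = existing_start - new_start;	/* 32768 */

	printf("merged em: start=%llu len=%llu (ends exactly at %llu)\n",
	       new_start, new_len, new_start + new_len);
	printf("block_start would be shifted by start_diff=%llu as well\n",
	       start_diff);
	return 0;
}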
@@ -6263,6 +6332,8 @@ next:
                        goto not_found;
                if (start + len <= found_key.offset)
                        goto not_found;
+               if (start > found_key.offset)
+                       goto next;
                em->start = start;
                em->orig_start = start;
                em->len = found_key.offset - start;
@@ -6378,8 +6449,7 @@ insert:
                                                         em->len);
                        if (existing) {
                                err = merge_extent_mapping(em_tree, existing,
-                                                          em, start,
-                                                          root->sectorsize);
+                                                          em, start);
                                free_extent_map(existing);
                                if (err) {
                                        free_extent_map(em);
@@ -7146,7 +7216,8 @@ again:
        if (!ret)
                goto out_test;
 
-       btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+       btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+                       finish_ordered_fn, NULL, NULL);
        btrfs_queue_work(root->fs_info->endio_write_workers,
                         &ordered->work);
 out_test:
@@ -7294,10 +7365,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        map_length = orig_bio->bi_iter.bi_size;
        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                              &map_length, NULL, 0);
-       if (ret) {
-               bio_put(orig_bio);
+       if (ret)
                return -EIO;
-       }
 
        if (map_length >= orig_bio->bi_iter.bi_size) {
                bio = orig_bio;
@@ -7314,6 +7383,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
        if (!bio)
                return -ENOMEM;
+
        bio->bi_private = dip;
        bio->bi_end_io = btrfs_end_dio_bio;
        atomic_inc(&dip->pending_bios);
@@ -7522,7 +7592,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
        count = iov_iter_count(iter);
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags))
-               filemap_fdatawrite_range(inode->i_mapping, offset, count);
+               filemap_fdatawrite_range(inode->i_mapping, offset,
+                                        offset + count - 1);
 
        if (rw & WRITE) {
                /*
@@ -7938,27 +8009,6 @@ static int btrfs_truncate(struct inode *inode)
                                      min_size);
        BUG_ON(ret);
 
-       /*
-        * setattr is responsible for setting the ordered_data_close flag,
-        * but that is only tested during the last file release.  That
-        * could happen well after the next commit, leaving a great big
-        * window where new writes may get lost if someone chooses to write
-        * to this file after truncating to zero
-        *
-        * The inode doesn't have any dirty data here, and so if we commit
-        * this is a noop.  If someone immediately starts writing to the inode
-        * it is very likely we'll catch some of their writes in this
-        * transaction, and the commit will find this file on the ordered
-        * data list with good things to send down.
-        *
-        * This is a best effort solution, there is still a window where
-        * using truncate to replace the contents of the file will
-        * end up with a zero length file after a crash.
-        */
-       if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-                                          &BTRFS_I(inode)->runtime_flags))
-               btrfs_add_ordered_operation(trans, root, inode);
-
        /*
         * So if we truncate and then write and fsync we normally would just
         * write the extents that changed, which is a problem if we need to
@@ -8106,7 +8156,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
-       INIT_LIST_HEAD(&ei->ordered_operations);
        RB_CLEAR_NODE(&ei->rb_node);
 
        return inode;
@@ -8146,17 +8195,6 @@ void btrfs_destroy_inode(struct inode *inode)
        if (!root)
                goto free;
 
-       /*
-        * Make sure we're properly removed from the ordered operation
-        * lists.
-        */
-       smp_mb();
-       if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-               spin_lock(&root->fs_info->ordered_root_lock);
-               list_del_init(&BTRFS_I(inode)->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_root_lock);
-       }
-
        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                     &BTRFS_I(inode)->runtime_flags)) {
                btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8338,12 +8376,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        ret = 0;
 
        /*
-        * we're using rename to replace one file with another.
-        * and the replacement file is large.  Start IO on it now so
-        * we don't add too much work to the end of the transaction
+        * we're using rename to replace one file with another.  Start IO on it
+        * now so we don't add too much work to the end of the transaction
         */
-       if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
-           old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+       if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
                filemap_flush(old_inode->i_mapping);
 
        /* close the racy window with snapshot create/destroy ioctl */
@@ -8391,12 +8427,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 */
                btrfs_pin_log_trans(root);
        }
-       /*
-        * make sure the inode gets flushed if it is replacing
-        * something.
-        */
-       if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
-               btrfs_add_ordered_operation(trans, root, old_inode);
 
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
@@ -8514,7 +8544,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
        work->inode = inode;
        work->wait = wait;
        work->delay_iput = delay_iput;
-       btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+       WARN_ON_ONCE(!inode);
+       btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+                       btrfs_run_delalloc_work, NULL, NULL);
 
        return work;
 }
@@ -8998,6 +9030,14 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
        if (ret)
                goto out;
 
+       /*
+        * We set the number of links to 0 in btrfs_new_inode(), and here we
+        * set it to 1 because d_tmpfile() would issue a warning if the count
+        * were 0, via:
+        *
+        *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+        */
+       set_nlink(inode, 1);
        d_tmpfile(dentry, inode);
        mark_inode_dirty(inode);
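
Taken together with the set_nlink(inode, 0) hunk in btrfs_new_inode() above, the link count of an O_TMPFILE inode moves through a short sequence; a comment-style sketch of it (the call chain is the one named in the comment above, the surrounding narration is paraphrase):

/*
 * Sketch of the i_nlink sequence for btrfs_tmpfile(), for illustration:
 *
 *   btrfs_new_inode(..., name == NULL)
 *       set_nlink(inode, 0);          inode item written with nlink 0
 *   btrfs_tmpfile()
 *       set_nlink(inode, 1);          temporary, to keep the VFS happy
 *       d_tmpfile(dentry, inode);  -> inode_dec_link_count()
 *                                  -> drop_nlink(): 1 -> 0, no warning
 *
 *   A later linkat() (see the userspace sketch above) takes it 0 -> 1.
 */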