diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d367..4e1b153 100644
@@ -34,6 +34,8 @@
 #include "locking.h"
 #include "free-space-cache.h"
 
+#undef SCRAMBLE_DELAYED_REFS
+
 /*
  * control flags for do_chunk_alloc's force field
  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@ -2217,6 +2219,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        int count = 0;
        int must_insert_reserved = 0;
@@ -2255,7 +2258,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref = select_delayed_ref(locked_ref);
 
                if (ref && ref->seq &&
-                   btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+                   btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
                        /*
                         * there are still refs with lower seq numbers in the
                         * process of being added. Don't run this ref yet.
@@ -2337,7 +2340,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
 
 next:
-               do_chunk_alloc(trans, root->fs_info->extent_root,
+               do_chunk_alloc(trans, fs_info->extent_root,
                               2 * 1024 * 1024,
                               btrfs_get_alloc_profile(root, 0),
                               CHUNK_ALLOC_NO_FORCE);
@@ -2347,21 +2350,99 @@ next:
        return count;
 }
 
-static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
+                              struct btrfs_delayed_ref_root *delayed_refs,
                               unsigned long num_refs,
                               struct list_head *first_seq)
 {
        spin_unlock(&delayed_refs->lock);
        pr_debug("waiting for more refs (num %ld, first %p)\n",
                 num_refs, first_seq);
-       wait_event(delayed_refs->seq_wait,
+       wait_event(fs_info->tree_mod_seq_wait,
                   num_refs != delayed_refs->num_entries ||
-                  delayed_refs->seq_head.next != first_seq);
+                  fs_info->tree_mod_seq_list.next != first_seq);
        pr_debug("done waiting for more refs (num %ld, first %p)\n",
-                delayed_refs->num_entries, delayed_refs->seq_head.next);
+                delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
        spin_lock(&delayed_refs->lock);
 }
 
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. In most
+ * cases this matches the order in which they were added. To expose
+ * dependencies on this order, we start processing the tree in the middle
+ * instead of at the beginning.
+ */
+static u64 find_middle(struct rb_root *root)
+{
+       struct rb_node *n = root->rb_node;
+       struct btrfs_delayed_ref_node *entry;
+       int alt = 1;
+       u64 middle;
+       u64 first = 0, last = 0;
+
+       n = rb_first(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               first = entry->bytenr;
+       }
+       n = rb_last(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               last = entry->bytenr;
+       }
+       n = root->rb_node;
+
+       while (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               WARN_ON(!entry->in_tree);
+
+               middle = entry->bytenr;
+
+               if (alt)
+                       n = n->rb_left;
+               else
+                       n = n->rb_right;
+
+               alt = 1 - alt;
+       }
+       return middle;
+}
+#endif
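
The comment above states the intent: delayed refs are normally run in ascending bytenr order, and defining SCRAMBLE_DELAYED_REFS makes processing start from roughly the middle of the rb-tree instead, to expose code that silently depends on that order. Below is a minimal sketch of the same alternating-descent idea on a toy binary search tree; toy_node and toy_find_middle are illustrative names only, not part of this patch.

/* A toy BST node; stands in for the delayed ref rb-tree entries. */
struct toy_node {
        unsigned long long key;                 /* plays the role of ->bytenr */
        struct toy_node *left, *right;
};

/*
 * Descend from the root, alternating left and right, so the last key
 * visited lies roughly in the middle of the sorted key range.  Exactness
 * does not matter: the result only seeds where processing starts, just as
 * find_middle() seeds delayed_refs->run_delayed_start further down.
 */
static unsigned long long toy_find_middle(struct toy_node *root)
{
        struct toy_node *n = root;
        unsigned long long middle = 0;
        int alt = 1;

        while (n) {
                middle = n->key;
                n = alt ? n->left : n->right;
                alt = 1 - alt;
        }
        return middle;
}
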
+
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info)
+{
+       struct qgroup_update *qgroup_update;
+       int ret = 0;
+
+       if (list_empty(&trans->qgroup_ref_list) !=
+           !trans->delayed_ref_elem.seq) {
+               /* list without seq or seq without list */
+               printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+                       list_empty(&trans->qgroup_ref_list) ? "" : " not",
+                       trans->delayed_ref_elem.seq);
+               BUG();
+       }
+
+       if (!trans->delayed_ref_elem.seq)
+               return 0;
+
+       while (!list_empty(&trans->qgroup_ref_list)) {
+               qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+                                                struct qgroup_update, list);
+               list_del(&qgroup_update->list);
+               if (!ret)
+                       ret = btrfs_qgroup_account_ref(
+                                       trans, fs_info, qgroup_update->node,
+                                       qgroup_update->extent_op);
+               kfree(qgroup_update);
+       }
+
+       btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+       return ret;
+}
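
btrfs_delayed_refs_qgroup_accounting() first checks the invariant that a non-empty qgroup_ref_list and a held tree_mod_seq element always go together, then drains the list: every queued update is fed to btrfs_qgroup_account_ref() and freed, and only the first error is remembered while the rest of the list is still emptied. A small stand-alone sketch of that drain-but-keep-first-error pattern follows; toy_update, toy_account and toy_drain_updates are made-up stand-ins, not btrfs types.

#include <stdlib.h>

/* Made-up stand-in for the queued struct qgroup_update entries. */
struct toy_update {
        struct toy_update *next;
        long long delta;
};

/* Pretend accounting step; stands in for btrfs_qgroup_account_ref(). */
static int toy_account(long long delta)
{
        (void)delta;
        return 0;
}

/*
 * Drain the whole queue even after a failure: every entry must be freed,
 * but only the first error value is reported back to the caller.
 */
static int toy_drain_updates(struct toy_update **head)
{
        int ret = 0;

        while (*head) {
                struct toy_update *u = *head;

                *head = u->next;
                if (!ret)
                        ret = toy_account(u->delta);
                free(u);
        }
        return ret;
}
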
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2398,11 +2479,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
                       CHUNK_ALLOC_NO_FORCE);
 
+       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
        consider_waiting = 0;
        spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+       delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
                run_most = 1;
@@ -2437,7 +2525,7 @@ again:
                                num_refs = delayed_refs->num_entries;
                                first_seq = root->fs_info->tree_mod_seq_list.next;
                        } else {
-                               wait_for_more_refs(delayed_refs,
+                               wait_for_more_refs(root->fs_info, delayed_refs,
                                                   num_refs, first_seq);
                                /*
                                 * after waiting, things have changed. we
@@ -2502,6 +2590,7 @@ again:
        }
 out:
        spin_unlock(&delayed_refs->lock);
+       assert_qgroups_uptodate(trans);
        return 0;
 }
 
@@ -2581,8 +2670,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 
        node = rb_prev(node);
        if (node) {
+               u64 seq = ref->seq;
+
                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-               if (ref->bytenr == bytenr)
+               if (ref->bytenr == bytenr && ref->seq == seq)
                        goto out_unlock;
        }
 
@@ -2903,8 +2994,13 @@ again:
        }
 
        spin_lock(&block_group->lock);
-       if (block_group->cached != BTRFS_CACHE_FINISHED) {
-               /* We're not cached, don't bother trying to write stuff out */
+       if (block_group->cached != BTRFS_CACHE_FINISHED ||
+           !btrfs_test_opt(root, SPACE_CACHE)) {
+               /*
+                * don't bother trying to write stuff out _if_
+                * a) we're not cached,
+                * b) we're using the nospace_cache mount option.
+                */
                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
@@ -3134,6 +3230,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        init_waitqueue_head(&found->wait);
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               info->data_sinfo = found;
        return 0;
 }
 
@@ -3263,12 +3361,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        return get_alloc_profile(root, flags);
 }
 
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-       BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-                                                      BTRFS_BLOCK_GROUP_DATA);
-}
-
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
@@ -3277,6 +3369,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
        int ret = 0, committed = 0, alloc_chunk = 1;
 
@@ -3289,7 +3382,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
                committed = 1;
        }
 
-       data_sinfo = BTRFS_I(inode)->space_info;
+       data_sinfo = fs_info->data_sinfo;
        if (!data_sinfo)
                goto alloc;
 
@@ -3330,10 +3423,9 @@ alloc:
                                        goto commit_trans;
                        }
 
-                       if (!data_sinfo) {
-                               btrfs_set_inode_space_info(root, inode);
-                               data_sinfo = BTRFS_I(inode)->space_info;
-                       }
+                       if (!data_sinfo)
+                               data_sinfo = fs_info->data_sinfo;
+
                        goto again;
                }
 
@@ -3380,7 +3472,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
-       data_sinfo = BTRFS_I(inode)->space_info;
+       data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3586,89 +3678,58 @@ out:
 /*
  * shrink metadata reservation for delalloc
  */
-static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
-                          bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+                           bool wait_ordered)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
-       u64 reserved;
+       u64 delalloc_bytes;
        u64 max_reclaim;
-       u64 reclaimed = 0;
        long time_left;
        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
-       unsigned long progress;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
 
        smp_mb();
-       reserved = space_info->bytes_may_use;
-       progress = space_info->reservation_progress;
-
-       if (reserved == 0)
-               return 0;
-
-       smp_mb();
-       if (root->fs_info->delalloc_bytes == 0) {
+       delalloc_bytes = root->fs_info->delalloc_bytes;
+       if (delalloc_bytes == 0) {
                if (trans)
-                       return 0;
+                       return;
                btrfs_wait_ordered_extents(root, 0, 0);
-               return 0;
+               return;
        }
 
-       max_reclaim = min(reserved, to_reclaim);
-       nr_pages = max_t(unsigned long, nr_pages,
-                        max_reclaim >> PAGE_CACHE_SHIFT);
-       while (loops < 1024) {
-               /* have the flusher threads jump in and do some IO */
-               smp_mb();
-               nr_pages = min_t(unsigned long, nr_pages,
-                      root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+       while (delalloc_bytes && loops < 3) {
+               max_reclaim = min(delalloc_bytes, to_reclaim);
+               nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-                                               WB_REASON_FS_FREE_SPACE);
+                                              WB_REASON_FS_FREE_SPACE);
 
                spin_lock(&space_info->lock);
-               if (reserved > space_info->bytes_may_use)
-                       reclaimed += reserved - space_info->bytes_may_use;
-               reserved = space_info->bytes_may_use;
+               if (space_info->bytes_used + space_info->bytes_reserved +
+                   space_info->bytes_pinned + space_info->bytes_readonly +
+                   space_info->bytes_may_use + orig <=
+                   space_info->total_bytes) {
+                       spin_unlock(&space_info->lock);
+                       break;
+               }
                spin_unlock(&space_info->lock);
 
                loops++;
-
-               if (reserved == 0 || reclaimed >= max_reclaim)
-                       break;
-
-               if (trans && trans->transaction->blocked)
-                       return -EAGAIN;
-
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_extents(root, 0, 0);
                } else {
-                       time_left = schedule_timeout_interruptible(1);
-
-                       /* We were interrupted, exit */
+                       time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
-
-               /* we've kicked the IO a few times, if anything has been freed,
-                * exit.  There is no sense in looping here for a long time
-                * when we really need to commit the transaction, or there are
-                * just too many writers without enough free space
-                */
-
-               if (loops > 3) {
-                       smp_mb();
-                       if (progress != space_info->reservation_progress)
-                               break;
-               }
-
+               smp_mb();
+               delalloc_bytes = root->fs_info->delalloc_bytes;
        }
-
-       return reclaimed >= to_reclaim;
 }
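
The rewritten shrink_delalloc() no longer tracks reclaim progress; it kicks the flusher threads for at most three rounds and stops as soon as the space_info could accommodate the original reservation again. The break test sums every form of accounted space and compares against total_bytes. A hedged sketch of that predicate follows; can_fit_orig_bytes is an illustrative helper name, and the real loop samples these fields under space_info->lock.

/*
 * Stop flushing once the space_info could fit 'orig' more bytes on top of
 * everything currently used, reserved, pinned, read-only or may-used.
 * Field names follow struct btrfs_space_info; the helper is a sketch only.
 */
static bool can_fit_orig_bytes(struct btrfs_space_info *sinfo, u64 orig)
{
        u64 used = sinfo->bytes_used + sinfo->bytes_reserved +
                   sinfo->bytes_pinned + sinfo->bytes_readonly +
                   sinfo->bytes_may_use;

        return used + orig <= sinfo->total_bytes;
}
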
 
 /**
@@ -3728,6 +3789,58 @@ commit:
        return btrfs_commit_transaction(trans, root);
 }
 
+enum flush_state {
+       FLUSH_DELALLOC          =       1,
+       FLUSH_DELALLOC_WAIT     =       2,
+       FLUSH_DELAYED_ITEMS_NR  =       3,
+       FLUSH_DELAYED_ITEMS     =       4,
+       COMMIT_TRANS            =       5,
+};
+
+static int flush_space(struct btrfs_root *root,
+                      struct btrfs_space_info *space_info, u64 num_bytes,
+                      u64 orig_bytes, int state)
+{
+       struct btrfs_trans_handle *trans;
+       int nr;
+       int ret = 0;
+
+       switch (state) {
+       case FLUSH_DELALLOC:
+       case FLUSH_DELALLOC_WAIT:
+               shrink_delalloc(root, num_bytes, orig_bytes,
+                               state == FLUSH_DELALLOC_WAIT);
+               break;
+       case FLUSH_DELAYED_ITEMS_NR:
+       case FLUSH_DELAYED_ITEMS:
+               if (state == FLUSH_DELAYED_ITEMS_NR) {
+                       u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+                       nr = (int)div64_u64(num_bytes, bytes);
+                       if (!nr)
+                               nr = 1;
+                       nr *= 2;
+               } else {
+                       nr = -1;
+               }
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               ret = btrfs_run_delayed_items_nr(trans, root, nr);
+               btrfs_end_transaction(trans, root);
+               break;
+       case COMMIT_TRANS:
+               ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+               break;
+       default:
+               ret = -ENOSPC;
+               break;
+       }
+
+       return ret;
+}
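
flush_space() folds the old ad-hoc retry logic into one escalation ladder: each state frees space more aggressively than the last, from kicking delalloc writeback up to committing the transaction. The caller, reserve_metadata_bytes() below, walks the ladder one step per failed reservation attempt and only gives up once COMMIT_TRANS has been tried. A condensed sketch of that caller-side loop follows; reserve_with_flushing and try_reserve are hypothetical names for illustration, while flush_space and the flush_state enum are the ones introduced above.

/* Hypothetical: returns 0 if 'bytes' could be reserved, -ENOSPC otherwise. */
static int try_reserve(struct btrfs_space_info *space_info, u64 bytes);

static int reserve_with_flushing(struct btrfs_root *root,
                                 struct btrfs_space_info *space_info,
                                 u64 bytes)
{
        int flush_state = FLUSH_DELALLOC;
        int ret;

        do {
                ret = try_reserve(space_info, bytes);
                if (!ret)
                        return 0;
                /* Could not reserve: try to free something, then retry. */
                ret = flush_space(root, space_info, bytes, bytes, flush_state);
                flush_state++;
        } while (!ret || flush_state <= COMMIT_TRANS);

        return ret;
}
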
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
  * @root - the root we're allocating for
@@ -3749,11 +3862,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
        u64 num_bytes = orig_bytes;
-       int retries = 0;
+       int flush_state = FLUSH_DELALLOC;
        int ret = 0;
-       bool committed = false;
        bool flushing = false;
-       bool wait_ordered = false;
+       bool committed = false;
 
 again:
        ret = 0;
@@ -3812,9 +3924,8 @@ again:
                 * amount plus the amount of bytes that we need for this
                 * reservation.
                 */
-               wait_ordered = true;
                num_bytes = used - space_info->total_bytes +
-                       (orig_bytes * (retries + 1));
+                       (orig_bytes * 2);
        }
 
        if (ret) {
@@ -3867,8 +3978,6 @@ again:
                        trace_btrfs_space_reservation(root->fs_info,
                                "space_info", space_info->flags, orig_bytes, 1);
                        ret = 0;
-               } else {
-                       wait_ordered = true;
                }
        }
 
@@ -3887,36 +3996,13 @@ again:
        if (!ret || !flush)
                goto out;
 
-       /*
-        * We do synchronous shrinking since we don't actually unreserve
-        * metadata until after the IO is completed.
-        */
-       ret = shrink_delalloc(root, num_bytes, wait_ordered);
-       if (ret < 0)
-               goto out;
-
-       ret = 0;
-
-       /*
-        * So if we were overcommitted it's possible that somebody else flushed
-        * out enough space and we simply didn't have enough space to reclaim,
-        * so go back around and try again.
-        */
-       if (retries < 2) {
-               wait_ordered = true;
-               retries++;
+       ret = flush_space(root, space_info, num_bytes, orig_bytes,
+                         flush_state);
+       flush_state++;
+       if (!ret)
                goto again;
-       }
-
-       ret = -ENOSPC;
-       if (committed)
-               goto out;
-
-       ret = may_commit_transaction(root, space_info, orig_bytes, 0);
-       if (!ret) {
-               committed = true;
+       else if (flush_state <= COMMIT_TRANS)
                goto again;
-       }
 
 out:
        if (flushing) {
@@ -3934,7 +4020,10 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
        struct btrfs_block_rsv *block_rsv = NULL;
 
-       if (root->ref_cows || root == root->fs_info->csum_root)
+       if (root->ref_cows)
+               block_rsv = trans->block_rsv;
+
+       if (root == root->fs_info->csum_root && trans->adding_csums)
                block_rsv = trans->block_rsv;
 
        if (!block_rsv)
@@ -4286,6 +4375,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
 {
+       if (!trans->block_rsv)
+               return;
+
        if (!trans->bytes_reserved)
                return;
 
@@ -4444,7 +4536,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        int ret;
 
        /* Need to be holding the i_mutex here if we aren't free space cache */
-       if (btrfs_is_free_space_inode(root, inode))
+       if (btrfs_is_free_space_inode(inode))
                flush = 0;
 
        if (flush && btrfs_transaction_in_commit(root->fs_info))
@@ -4476,6 +4568,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
 
+       if (root->fs_info->quota_enabled) {
+               ret = btrfs_qgroup_reserve(root, num_bytes +
+                                          nr_extents * root->leafsize);
+               if (ret)
+                       return ret;
+       }
+
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (ret) {
                u64 to_free = 0;
@@ -4554,6 +4653,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
+       if (root->fs_info->quota_enabled) {
+               btrfs_qgroup_free(root, num_bytes +
+                                       dropped * root->leafsize);
+       }
+
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
 }
@@ -5190,8 +5294,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        rb_erase(&head->node.rb_node, &delayed_refs->root);
 
        delayed_refs->num_entries--;
-       if (waitqueue_active(&delayed_refs->seq_wait))
-               wake_up(&delayed_refs->seq_wait);
+       smp_mb();
+       if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
+               wake_up(&root->fs_info->tree_mod_seq_wait);
 
        /*
         * we don't take a ref on the node because we're removing it from the
@@ -5748,7 +5853,11 @@ loop:
                                ret = do_chunk_alloc(trans, root, num_bytes +
                                                     2 * 1024 * 1024, data,
                                                     CHUNK_ALLOC_LIMITED);
-                               if (ret < 0) {
+                               /*
+                                * Do not bail out on ENOSPC since we
+                                * can do more things.
+                                */
+                               if (ret < 0 && ret != -ENOSPC) {
                                        btrfs_abort_transaction(trans,
                                                                root, ret);
                                        goto out;
@@ -5816,13 +5925,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
-               printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
-                      "%llu pinned %llu reserved\n",
+               printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
                       (unsigned long long)cache->key.objectid,
                       (unsigned long long)cache->key.offset,
                       (unsigned long long)btrfs_block_group_used(&cache->item),
                       (unsigned long long)cache->pinned,
-                      (unsigned long long)cache->reserved);
+                      (unsigned long long)cache->reserved,
+                      cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
@@ -7610,8 +7719,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
 
-               if (need_clear)
+               if (need_clear) {
+                       /*
+                        * When we mount with an old space cache, we need to
+                        * set BTRFS_DC_CLEAR and set the dirty flag.
+                        *
+                        * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+                        *    truncate the old free space cache inode and
+                        *    set up a new one.
+                        * b) Setting the 'dirty' flag makes sure that we flush
+                        *    the new space cache info onto disk.
+                        */
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
+                       if (btrfs_test_opt(root, SPACE_CACHE))
+                               cache->dirty = 1;
+               }
 
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),