Btrfs: check if extent buffer is aligned to sectorsize
[cascardo/linux.git] fs/btrfs/extent-tree.c
index 84e060e..5439e85 100644
@@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root,
 {
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
-                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+                       start, end, EXTENT_UPTODATE);
        set_extent_bits(&root->fs_info->freed_extents[1],
-                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+                       start, end, EXTENT_UPTODATE);
        return 0;
 }
 
@@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root,
        end = start + cache->key.offset - 1;
 
        clear_extent_bits(&root->fs_info->freed_extents[0],
-                         start, end, EXTENT_UPTODATE, GFP_NOFS);
+                         start, end, EXTENT_UPTODATE);
        clear_extent_bits(&root->fs_info->freed_extents[1],
-                         start, end, EXTENT_UPTODATE, GFP_NOFS);
+                         start, end, EXTENT_UPTODATE);
 }
 
 static int exclude_super_stripes(struct btrfs_root *root,
@@ -980,7 +980,7 @@ out_free:
  * event that tree block loses its owner tree's reference and do the
  * back refs conversion.
  *
- * When a tree block is COW'd through a tree, there are four cases:
+ * When a tree block is COWed through a tree, there are four cases:
  *
  * The reference count of the block is one and the tree is the block's
  * owner tree. Nothing to do in this case.
@@ -2042,6 +2042,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
        struct btrfs_bio *bbio = NULL;
 
 
+       /*
+        * Avoid races with device replace and make sure our bbio has devices
+        * associated with its stripes that don't go away while we are discarding.
+        */
+       btrfs_bio_counter_inc_blocked(root->fs_info);
        /* Tell the block device(s) that the sectors can be discarded */
        ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
                              bytenr, &num_bytes, &bbio, 0);
@@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                }
                btrfs_put_bbio(bbio);
        }
+       btrfs_bio_counter_dec(root->fs_info);
 
        if (actual_bytes)
                *actual_bytes = discarded_bytes;
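
[Editor's note] The inc/dec pair added above implements a "blocked counter": each discard takes a reference that the device-replace code can block and drain before tearing devices down, so the bbio's device pointers stay valid for the whole discard. A minimal userspace sketch of the idiom follows; it is illustrative only, not the kernel's btrfs_bio_counter implementation.

#include <pthread.h>

/* Illustrative sketch only -- NOT the kernel implementation. */
struct bio_counter {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	long inflight;   /* I/Os (here: discards) in flight */
	int  blocked;    /* set while a device replace is running */
};

/* Called before mapping/submitting I/O: wait out any replace, then
 * account ourselves as in flight. */
static void bio_counter_inc_blocked(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	while (c->blocked)
		pthread_cond_wait(&c->cond, &c->lock);
	c->inflight++;
	pthread_mutex_unlock(&c->lock);
}

/* Called once the I/O, and any use of the mapped device pointers,
 * is finished. */
static void bio_counter_dec(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	if (--c->inflight == 0)
		pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* Replace side: stop new I/O, then drain what is already in flight. */
static void bio_counter_block_and_drain(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	c->blocked = 1;
	while (c->inflight > 0)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}
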
@@ -2595,7 +2601,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        }
 
                        /*
-                        * Need to drop our head ref lock and re-aqcuire the
+                        * Need to drop our head ref lock and re-acquire the
                         * delayed ref lock and then re-check to make sure
                         * nobody got added.
                         */
@@ -2747,7 +2753,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 
        /*
         * We don't ever fill up leaves all the way so multiply by 2 just to be
-        * closer to what we're really going to want to ouse.
+        * closer to what we're really going to want to use.
         */
        return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
@@ -2851,7 +2857,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
        }
 
        /*
-        * trans->sync means that when we call end_transaciton, we won't
+        * trans->sync means that when we call end_transaction, we won't
         * wait on delayed refs
         */
        trans->sync = true;
@@ -3824,6 +3830,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
        return readonly;
 }
 
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+       struct btrfs_block_group_cache *bg;
+       bool ret = true;
+
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg)
+               return false;
+
+       spin_lock(&bg->lock);
+       if (bg->ro)
+               ret = false;
+       else
+               atomic_inc(&bg->nocow_writers);
+       spin_unlock(&bg->lock);
+
+       /* no put on block group, done by btrfs_dec_nocow_writers */
+       if (!ret)
+               btrfs_put_block_group(bg);
+
+       return ret;
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+       struct btrfs_block_group_cache *bg;
+
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       ASSERT(bg);
+       if (atomic_dec_and_test(&bg->nocow_writers))
+               wake_up_atomic_t(&bg->nocow_writers);
+       /*
+        * Once for our lookup and once for the lookup done by a previous call
+        * to btrfs_inc_nocow_writers()
+        */
+       btrfs_put_block_group(bg);
+       btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{
+       wait_on_atomic_t(&bg->nocow_writers,
+                        btrfs_wait_nocow_writers_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+}
+
 static const char *alloc_name(u64 flags)
 {
        switch (flags) {
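
[Editor's note] A hedged sketch of how the new nocow_writers pair is meant to be used by a write path. The caller and run_nocow_write() below are hypothetical; only btrfs_inc_nocow_writers()/btrfs_dec_nocow_writers() come from the patch. A nocow writer pins the extent's block group before writing in place, falling back to COW if the group is read-only, and relocation's btrfs_wait_nocow_writers() blocks until all such writers have dropped their references.

/* Hypothetical caller, for illustration only. */
static int write_nocow_range(struct btrfs_fs_info *fs_info, u64 disk_bytenr)
{
	/* Fails if the block group is (being set) read-only, e.g. for
	 * relocation; the caller must then fall back to the COW path. */
	if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
		return -EAGAIN;

	run_nocow_write(fs_info, disk_bytenr);	/* hypothetical write-out */

	/* Wakes btrfs_wait_nocow_writers() when the count hits zero and
	 * drops the block group reference taken by the inc above. */
	btrfs_dec_nocow_writers(fs_info, disk_bytenr);
	return 0;
}
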
@@ -4141,7 +4200,7 @@ commit_trans:
 
                        if (need_commit > 0) {
                                btrfs_start_delalloc_roots(fs_info, 0, -1);
-                               btrfs_wait_ordered_roots(fs_info, -1);
+                               btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
                        }
 
                        trans = btrfs_join_transaction(root);
@@ -4243,7 +4302,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
  * Called if we need to clear a data reservation for this inode
  * Normally in a error case.
  *
- * This one will handle the per-indoe data rsv map for accurate reserved
+ * This one will handle the per-inode data rsv map for accurate reserved
  * space framework.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
@@ -4583,7 +4642,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                 */
                btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
                if (!current->journal_info)
-                       btrfs_wait_ordered_roots(root->fs_info, nr_items);
+                       btrfs_wait_ordered_roots(root->fs_info, nr_items,
+                                                0, (u64)-1);
        }
 }
 
@@ -4620,7 +4680,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
        /* Calc the number of the pages we need flush for space reservation */
        items = calc_reclaim_items_nr(root, to_reclaim);
-       to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+       to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4632,7 +4692,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                if (trans)
                        return;
                if (wait_ordered)
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                return;
        }
 
@@ -4671,7 +4732,8 @@ skip_async:
 
                loops++;
                if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
@@ -4911,7 +4973,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
  * @orig_bytes - the number of bytes we want
  * @flush - whether or not we can flush to make our reservation
  *
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
  * flush out space to make room.  It will do this by flushing delalloc if
  * possible or committing the transaction.  If flush is 0 then no attempts to
@@ -5516,7 +5578,7 @@ void btrfs_orphan_release_metadata(struct inode *inode)
  * common file/directory operations, they change two fs/file trees
  * and root tree, the number of items that the qgroup reserves is
  * different with the free space reservation. So we can not use
- * the space reseravtion mechanism in start_transaction().
+ * the space reservation mechanism in start_transaction().
  */
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv,
@@ -5565,7 +5627,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 /**
  * drop_outstanding_extent - drop an outstanding extent
  * @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're relaseing.
+ * @num_bytes: the number of bytes we're releasing.
  *
  * This is called when we are freeing up an outstanding extent, either called
  * after an error or after an extent is written.  This will return the number of
@@ -5591,7 +5653,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
                drop_inode_space = 1;
 
        /*
-        * If we have more or the same amount of outsanding extents than we have
+        * If we have at least as many outstanding extents as we have
         * reserved then we need to leave the reserved extents count alone.
         */
        if (BTRFS_I(inode)->outstanding_extents >=
@@ -5605,8 +5667,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * calc_csum_metadata_size - return the amount of metada space that must be
- *     reserved/free'd for the given bytes.
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ *     reserved/freed for the given bytes.
  * @inode: the inode we're manipulating
  * @num_bytes: the number of bytes in question
  * @reserve: 1 if we are reserving space, 0 if we are freeing space
@@ -5758,7 +5820,7 @@ out_fail:
 
                /*
                 * This is tricky, but first we need to figure out how much we
-                * free'd from any free-ers that occurred during this
+                * freed from any free-ers that occurred during this
                 * reservation, so we reset ->csum_bytes to the csum_bytes
                 * before we dropped our lock, and then call the free for the
                 * number of bytes that were freed while we were trying our
@@ -5780,7 +5842,7 @@ out_fail:
 
                /*
                 * Now reset ->csum_bytes to what it should be.  If bytes is
-                * more than to_free then we would have free'd more space had we
+                * more than to_free then we would have freed more space had we
                 * not had an artificially high ->csum_bytes, so we need to free
                 * the remainder.  If bytes is the same or less then we don't
                 * need to do anything, the other free-ers did the correct
@@ -6172,6 +6234,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
        return 0;
 }
 
+static void
+btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+       atomic_inc(&bg->reservations);
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+                                       const u64 start)
+{
+       struct btrfs_block_group_cache *bg;
+
+       bg = btrfs_lookup_block_group(fs_info, start);
+       ASSERT(bg);
+       if (atomic_dec_and_test(&bg->reservations))
+               wake_up_atomic_t(&bg->reservations);
+       btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+       struct btrfs_space_info *space_info = bg->space_info;
+
+       ASSERT(bg->ro);
+
+       if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+               return;
+
+       /*
+        * Our block group is read only but before we set it to read only,
+        * some task might have had allocated an extent from it already, but it
+        * has not yet created a respective ordered extent (and added it to a
+        * root's list of ordered extents).
+        * Therefore wait for any task currently allocating extents, since the
+        * block group's reservations counter is incremented while a read lock
+        * on the groups' semaphore is held and decremented after releasing
+        * the read access on that semaphore and creating the ordered extent.
+        */
+       down_write(&space_info->groups_sem);
+       up_write(&space_info->groups_sem);
+
+       wait_on_atomic_t(&bg->reservations,
+                        btrfs_wait_bg_reservations_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+}
+
 /**
  * btrfs_update_reserved_bytes - update the block_group and space info counters
  * @cache:     The cache we are manipulating
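
[Editor's note] The empty down_write()/up_write() pair added above is a lock-based barrier: the write lock cannot be acquired until every reader inside groups_sem has left, so afterwards no allocator can still be between allocating an extent and publishing it in bg->reservations. The same idiom in a self-contained userspace form (illustrative only):

#include <pthread.h>

static pthread_rwlock_t groups_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Readers publish state while holding the lock shared; a writer that
 * merely acquires and releases the lock is guaranteed that every
 * reader that started before it has finished publishing. */
static void rwlock_barrier(void)
{
	pthread_rwlock_wrlock(&groups_sem);
	/* intentionally empty: acquiring the lock is the synchronization */
	pthread_rwlock_unlock(&groups_sem);
}
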
@@ -6408,7 +6521,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                        ret = btrfs_discard_extent(root, start,
                                                   end + 1 - start, NULL);
 
-               clear_extent_dirty(unpin, start, end, GFP_NOFS);
+               clear_extent_dirty(unpin, start, end);
                unpin_extent_range(root, start, end, true);
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                cond_resched();
@@ -7025,36 +7138,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
                   int delalloc)
 {
        struct btrfs_block_group_cache *used_bg = NULL;
-       bool locked = false;
-again:
+
        spin_lock(&cluster->refill_lock);
-       if (locked) {
-               if (used_bg == cluster->block_group)
+       while (1) {
+               used_bg = cluster->block_group;
+               if (!used_bg)
+                       return NULL;
+
+               if (used_bg == block_group)
                        return used_bg;
 
-               up_read(&used_bg->data_rwsem);
-               btrfs_put_block_group(used_bg);
-       }
+               btrfs_get_block_group(used_bg);
 
-       used_bg = cluster->block_group;
-       if (!used_bg)
-               return NULL;
+               if (!delalloc)
+                       return used_bg;
 
-       if (used_bg == block_group)
-               return used_bg;
+               if (down_read_trylock(&used_bg->data_rwsem))
+                       return used_bg;
 
-       btrfs_get_block_group(used_bg);
+               spin_unlock(&cluster->refill_lock);
 
-       if (!delalloc)
-               return used_bg;
+               down_read(&used_bg->data_rwsem);
 
-       if (down_read_trylock(&used_bg->data_rwsem))
-               return used_bg;
+               spin_lock(&cluster->refill_lock);
+               if (used_bg == cluster->block_group)
+                       return used_bg;
 
-       spin_unlock(&cluster->refill_lock);
-       down_read(&used_bg->data_rwsem);
-       locked = true;
-       goto again;
+               up_read(&used_bg->data_rwsem);
+               btrfs_put_block_group(used_bg);
+       }
 }
 
 static inline void
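
[Editor's note] The rewritten btrfs_lock_cluster() above is an instance of the classic trylock/drop/reacquire/revalidate pattern: we may not sleep on data_rwsem while holding the refill_lock spinlock, so on trylock failure we drop the spinlock, block on the rwsem, retake the spinlock, and check that the cluster still points at the block group we locked, undoing and retrying if not. A simplified userspace sketch follows; unlike the kernel code it does not return with refill_lock held, and the kernel additionally holds a refcount on the group across the unlocked window.

#include <pthread.h>
#include <stddef.h>

struct group {
	pthread_rwlock_t rw;		/* sleeping lock, like data_rwsem */
};

struct cluster {
	pthread_spinlock_t refill;	/* spinlock, like refill_lock */
	struct group *group;		/* may be retargeted concurrently */
};

static struct group *lock_cluster_group(struct cluster *c)
{
	struct group *g;

	pthread_spin_lock(&c->refill);
	for (;;) {
		g = c->group;
		if (!g || pthread_rwlock_tryrdlock(&g->rw) == 0)
			break;		/* nothing to lock, or got it */

		/* Cannot block on the rwlock under the spinlock. */
		pthread_spin_unlock(&c->refill);
		pthread_rwlock_rdlock(&g->rw);
		pthread_spin_lock(&c->refill);

		if (g == c->group)	/* still current: keep the lock */
			break;

		pthread_rwlock_unlock(&g->rw);	/* stale: undo and retry */
	}
	pthread_spin_unlock(&c->refill);
	return g;
}
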
@@ -7431,6 +7543,7 @@ checks:
                        btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
                }
+               btrfs_inc_block_group_reservations(block_group);
 
                /* we are all good, lets return */
                ins->objectid = search_start;
@@ -7471,7 +7584,7 @@ loop:
                if (loop == LOOP_CACHING_NOWAIT) {
                        /*
                         * We want to skip the LOOP_CACHING_WAIT step if we
-                        * don't have any unached bgs and we've alrelady done a
+                        * don't have any uncached bgs and we've already done a
                         * full search through.
                         */
                        if (orig_have_caching_bg || !full_search)
@@ -7612,8 +7725,10 @@ again:
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
                               flags, delalloc);
-
-       if (ret == -ENOSPC) {
+       if (!ret && !is_data) {
+               btrfs_dec_block_group_reservations(root->fs_info,
+                                                  ins->objectid);
+       } else if (ret == -ENOSPC) {
                if (!final_tried && ins->offset) {
                        num_bytes = min(num_bytes >> 1, ins->offset);
                        num_bytes = round_down(num_bytes, root->sectorsize);
@@ -7873,7 +7988,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 
        /*
         * Mixed block groups will exclude before processing the log so we only
-        * need to do the exlude dance if this fs isn't mixed.
+        * need to do the exclude dance if this fs isn't mixed.
         */
        if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
                ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
@@ -7901,8 +8016,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        struct extent_buffer *buf;
 
        buf = btrfs_find_create_tree_block(root, bytenr);
-       if (!buf)
-               return ERR_PTR(-ENOMEM);
+       if (IS_ERR(buf))
+               return buf;
+
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
@@ -7923,7 +8039,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                                        buf->start + buf->len - 1, GFP_NOFS);
                else
                        set_extent_new(&root->dirty_log_pages, buf->start,
-                                       buf->start + buf->len - 1, GFP_NOFS);
+                                       buf->start + buf->len - 1);
        } else {
                buf->log_index = -1;
                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
@@ -8544,8 +8660,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        next = btrfs_find_tree_block(root->fs_info, bytenr);
        if (!next) {
                next = btrfs_find_create_tree_block(root, bytenr);
-               if (!next)
-                       return -ENOMEM;
+               if (IS_ERR(next))
+                       return PTR_ERR(next);
+
                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
                                               level - 1);
                reada = 1;
@@ -9058,7 +9175,7 @@ out:
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
-               btrfs_std_error(root->fs_info, err, NULL);
+               btrfs_handle_fs_error(root->fs_info, err, NULL);
        return err;
 }
 
@@ -9317,7 +9434,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
        u64 free_bytes = 0;
        int factor;
 
-       /* It's df, we don't care if it's racey */
+       /* It's df, we don't care if it's racy */
        if (list_empty(&sinfo->ro_bgs))
                return 0;
 
@@ -10526,14 +10643,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
-                                 EXTENT_DIRTY, GFP_NOFS);
+                                 EXTENT_DIRTY);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
-                                 EXTENT_DIRTY, GFP_NOFS);
+                                 EXTENT_DIRTY);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_dec_block_group_ro(root, block_group);