Btrfs: check if extent buffer is aligned to sectorsize
[cascardo/linux.git] fs/btrfs/extent-tree.c
index 84e060e..5439e85 100644
@@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root,
 {
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
-                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+                       start, end, EXTENT_UPTODATE);
        set_extent_bits(&root->fs_info->freed_extents[1],
-                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+                       start, end, EXTENT_UPTODATE);
        return 0;
 }
 
@@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root,
        end = start + cache->key.offset - 1;
 
        clear_extent_bits(&root->fs_info->freed_extents[0],
-                         start, end, EXTENT_UPTODATE, GFP_NOFS);
+                         start, end, EXTENT_UPTODATE);
        clear_extent_bits(&root->fs_info->freed_extents[1],
-                         start, end, EXTENT_UPTODATE, GFP_NOFS);
+                         start, end, EXTENT_UPTODATE);
 }
 
 static int exclude_super_stripes(struct btrfs_root *root,
@@ -980,7 +980,7 @@ out_free:
  * event that tree block loses its owner tree's reference and do the
  * back refs conversion.
  *
- * When a tree block is COW'd through a tree, there are four cases:
+ * When a tree block is COWed through a tree, there are four cases:
  *
  * The reference count of the block is one and the tree is the block's
  * owner tree. Nothing to do in this case.
@@ -2042,6 +2042,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
        struct btrfs_bio *bbio = NULL;
 
 
+       /*
+        * Avoid races with device replace and make sure our bbio has devices
+        * associated with its stripes that don't go away while we are discarding.
+        */
+       btrfs_bio_counter_inc_blocked(root->fs_info);
        /* Tell the block device(s) that the sectors can be discarded */
        ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
                              bytenr, &num_bytes, &bbio, 0);
@@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                }
                btrfs_put_bbio(bbio);
        }
+       btrfs_bio_counter_dec(root->fs_info);
 
        if (actual_bytes)
                *actual_bytes = discarded_bytes;
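
[Editor's note] The inc/dec pair added above implements a "blocked counter": each discard takes a reference that the device-replace code can block and drain before tearing devices down, so the bbio's device pointers stay valid for the whole discard. A minimal userspace sketch of the idiom follows; it is illustrative only, not the kernel's btrfs_bio_counter implementation.

#include <pthread.h>

/* Illustrative sketch only -- NOT the kernel implementation. */
struct bio_counter {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	long inflight;   /* I/Os (here: discards) in flight */
	int  blocked;    /* set while a device replace is running */
};

/* Called before mapping/submitting I/O: wait out any replace, then
 * account ourselves as in flight. */
static void bio_counter_inc_blocked(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	while (c->blocked)
		pthread_cond_wait(&c->cond, &c->lock);
	c->inflight++;
	pthread_mutex_unlock(&c->lock);
}

/* Called once the I/O, and any use of the mapped device pointers,
 * is finished. */
static void bio_counter_dec(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	if (--c->inflight == 0)
		pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* Replace side: stop new I/O, then drain what is already in flight. */
static void bio_counter_block_and_drain(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	c->blocked = 1;
	while (c->inflight > 0)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}
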
@@ -2595,7 +2601,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        }
 
                        /*
-                        * Need to drop our head ref lock and re-aqcuire the
+                        * Need to drop our head ref lock and re-acquire the
                         * delayed ref lock and then re-check to make sure
                         * nobody got added.
                         */
@@ -2747,7 +2753,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 
        /*
         * We don't ever fill up leaves all the way so multiply by 2 just to be
-        * closer to what we're really going to want to ouse.
+        * closer to what we're really going to want to use.
         */
        return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
@@ -2851,7 +2857,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
        }
 
        /*
-        * trans->sync means that when we call end_transaciton, we won't
+        * trans->sync means that when we call end_transaction, we won't
         * wait on delayed refs
         */
        trans->sync = true;
@@ -3824,6 +3830,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
        return readonly;
 }
 
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+       struct btrfs_block_group_cache *bg;
+       bool ret = true;
+
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg)
+               return false;
+
+       spin_lock(&bg->lock);
+       if (bg->ro)
+               ret = false;
+       else
+               atomic_inc(&bg->nocow_writers);
+       spin_unlock(&bg->lock);
+
+       /* no put on block group, done by btrfs_dec_nocow_writers */
+       if (!ret)
+               btrfs_put_block_group(bg);
+
+       return ret;
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+       struct btrfs_block_group_cache *bg;
+
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       ASSERT(bg);
+       if (atomic_dec_and_test(&bg->nocow_writers))
+               wake_up_atomic_t(&bg->nocow_writers);
+       /*
+        * Once for our lookup and once for the lookup done by a previous call
+        * to btrfs_inc_nocow_writers()
+        */
+       btrfs_put_block_group(bg);
+       btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{
+       wait_on_atomic_t(&bg->nocow_writers,
+                        btrfs_wait_nocow_writers_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+}
+
 static const char *alloc_name(u64 flags)
 {
        switch (flags) {
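
[Editor's note] A hedged sketch of how the new nocow_writers pair is meant to be used by a write path. The caller and run_nocow_write() below are hypothetical; only btrfs_inc_nocow_writers()/btrfs_dec_nocow_writers() come from the patch. A nocow writer pins the extent's block group before writing in place, falling back to COW if the group is read-only, and relocation's btrfs_wait_nocow_writers() blocks until all such writers have dropped their references.

/* Hypothetical caller, for illustration only. */
static int write_nocow_range(struct btrfs_fs_info *fs_info, u64 disk_bytenr)
{
	/* Fails if the block group is (being set) read-only, e.g. for
	 * relocation; the caller must then fall back to the COW path. */
	if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
		return -EAGAIN;

	run_nocow_write(fs_info, disk_bytenr);	/* hypothetical write-out */

	/* Wakes btrfs_wait_nocow_writers() when the count hits zero and
	 * drops the block group reference taken by the inc above. */
	btrfs_dec_nocow_writers(fs_info, disk_bytenr);
	return 0;
}
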
@@ -4141,7 +4200,7 @@ commit_trans:
 
                        if (need_commit > 0) {
                                btrfs_start_delalloc_roots(fs_info, 0, -1);
-                               btrfs_wait_ordered_roots(fs_info, -1);
+                               btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
                        }
 
                        trans = btrfs_join_transaction(root);
@@ -4243,7 +4302,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
  * Called if we need to clear a data reservation for this inode
  * Normally in a error case.
  *
- * This one will handle the per-indoe data rsv map for accurate reserved
+ * This one will handle the per-inode data rsv map for accurate reserved
  * space framework.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
@@ -4583,7 +4642,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                 */
                btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
                if (!current->journal_info)
-                       btrfs_wait_ordered_roots(root->fs_info, nr_items);
+                       btrfs_wait_ordered_roots(root->fs_info, nr_items,
+                                                0, (u64)-1);
        }
 }
 
@@ -4620,7 +4680,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
        /* Calc the number of the pages we need flush for space reservation */
        items = calc_reclaim_items_nr(root, to_reclaim);
-       to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+       to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4632,7 +4692,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                if (trans)
                        return;
                if (wait_ordered)
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                return;
        }
 
@@ -4671,7 +4732,8 @@ skip_async:
 
                loops++;
                if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
@@ -4911,7 +4973,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
  * @orig_bytes - the number of bytes we want
  * @flush - whether or not we can flush to make our reservation
  *
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
  * flush out space to make room.  It will do this by flushing delalloc if
  * possible or committing the transaction.  If flush is 0 then no attempts to
@@ -5516,7 +5578,7 @@ void btrfs_orphan_release_metadata(struct inode *inode)
  * common file/directory operations, they change two fs/file trees
  * and root tree, the number of items that the qgroup reserves is
  * different with the free space reservation. So we can not use
- * the space reseravtion mechanism in start_transaction().
+ * the space reservation mechanism in start_transaction().
  */
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv,
@@ -5565,7 +5627,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 /**
  * drop_outstanding_extent - drop an outstanding extent
  * @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're relaseing.
+ * @num_bytes: the number of bytes we're releasing.
  *
  * This is called when we are freeing up an outstanding extent, either called
  * after an error or after an extent is written.  This will return the number of
@@ -5591,7 +5653,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
                drop_inode_space = 1;
 
        /*
-        * If we have more or the same amount of outsanding extents than we have
+        * If we have at least as many outstanding extents as we have
         * reserved then we need to leave the reserved extents count alone.
         */
        if (BTRFS_I(inode)->outstanding_extents >=
@@ -5605,8 +5667,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * calc_csum_metadata_size - return the amount of metada space that must be
- *     reserved/free'd for the given bytes.
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ *     reserved/freed for the given bytes.
  * @inode: the inode we're manipulating
  * @num_bytes: the number of bytes in question
  * @reserve: 1 if we are reserving space, 0 if we are freeing space
@@ -5758,7 +5820,7 @@ out_fail:
 
                /*
                 * This is tricky, but first we need to figure out how much we
-                * free'd from any free-ers that occurred during this
+                * freed from any free-ers that occurred during this
                 * reservation, so we reset ->csum_bytes to the csum_bytes
                 * before we dropped our lock, and then call the free for the
                 * number of bytes that were freed while we were trying our
@@ -5780,7 +5842,7 @@ out_fail:
 
                /*
                 * Now reset ->csum_bytes to what it should be.  If bytes is
-                * more than to_free then we would have free'd more space had we
+                * more than to_free then we would have freed more space had we
                 * not had an artificially high ->csum_bytes, so we need to free
                 * the remainder.  If bytes is the same or less then we don't
                 * need to do anything, the other free-ers did the correct
@@ -6172,6 +6234,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
        return 0;
 }
 
+static void
+btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+       atomic_inc(&bg->reservations);
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+                                       const u64 start)
+{
+       struct btrfs_block_group_cache *bg;
+
+       bg = btrfs_lookup_block_group(fs_info, start);
+       ASSERT(bg);
+       if (atomic_dec_and_test(&bg->reservations))
+               wake_up_atomic_t(&bg->reservations);
+       btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+       struct btrfs_space_info *space_info = bg->space_info;
+
+       ASSERT(bg->ro);
+
+       if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+               return;
+
+       /*
+        * Our block group is read only but before we set it to read only,
+        * some task might have had allocated an extent from it already, but it
+        * has not yet created a respective ordered extent (and added it to a
+        * root's list of ordered extents).
+        * Therefore wait for any task currently allocating extents, since the
+        * block group's reservations counter is incremented while a read lock
+        * on the groups' semaphore is held and decremented after releasing
+        * the read access on that semaphore and creating the ordered extent.
+        */
+       down_write(&space_info->groups_sem);
+       up_write(&space_info->groups_sem);
+
+       wait_on_atomic_t(&bg->reservations,
+                        btrfs_wait_bg_reservations_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+}
+
 /**
  * btrfs_update_reserved_bytes - update the block_group and space info counters
  * @cache:     The cache we are manipulating
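
[Editor's note] The empty down_write()/up_write() pair added above is a lock-based barrier: the write lock cannot be acquired until every reader inside groups_sem has left, so afterwards no allocator can still be between allocating an extent and publishing it in bg->reservations. The same idiom in a self-contained userspace form (illustrative only):

#include <pthread.h>

static pthread_rwlock_t groups_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Readers publish state while holding the lock shared; a writer that
 * merely acquires and releases the lock is guaranteed that every
 * reader that started before it has finished publishing. */
static void rwlock_barrier(void)
{
	pthread_rwlock_wrlock(&groups_sem);
	/* intentionally empty: acquiring the lock is the synchronization */
	pthread_rwlock_unlock(&groups_sem);
}
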
@@ -6408,7 +6521,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                        ret = btrfs_discard_extent(root, start,
                                                   end + 1 - start, NULL);
 
-               clear_extent_dirty(unpin, start, end, GFP_NOFS);
+               clear_extent_dirty(unpin, start, end);
                unpin_extent_range(root, start, end, true);
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                cond_resched();
@@ -7025,36 +7138,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
                   int delalloc)
 {
        struct btrfs_block_group_cache *used_bg = NULL;
-       bool locked = false;
-again:
+
        spin_lock(&cluster->refill_lock);
-       if (locked) {
-               if (used_bg == cluster->block_group)
+       while (1) {
+               used_bg = cluster->block_group;
+               if (!used_bg)
+                       return NULL;
+
+               if (used_bg == block_group)
                        return used_bg;
 
-               up_read(&used_bg->data_rwsem);
-               btrfs_put_block_group(used_bg);
-       }
+               btrfs_get_block_group(used_bg);
 
-       used_bg = cluster->block_group;
-       if (!used_bg)
-               return NULL;
+               if (!delalloc)
+                       return used_bg;
 
-       if (used_bg == block_group)
-               return used_bg;
+               if (down_read_trylock(&used_bg->data_rwsem))
+                       return used_bg;
 
-       btrfs_get_block_group(used_bg);
+               spin_unlock(&cluster->refill_lock);
 
-       if (!delalloc)
-               return used_bg;
+               down_read(&used_bg->data_rwsem);
 
-       if (down_read_trylock(&used_bg->data_rwsem))
-               return used_bg;
+               spin_lock(&cluster->refill_lock);
+               if (used_bg == cluster->block_group)
+                       return used_bg;
 
-       spin_unlock(&cluster->refill_lock);
-       down_read(&used_bg->data_rwsem);
-       locked = true;
-       goto again;
+               up_read(&used_bg->data_rwsem);
+               btrfs_put_block_group(used_bg);
+       }
 }
 
 static inline void
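
[Editor's note] The rewritten btrfs_lock_cluster() above is an instance of the classic trylock/drop/reacquire/revalidate pattern: we may not sleep on data_rwsem while holding the refill_lock spinlock, so on trylock failure we drop the spinlock, block on the rwsem, retake the spinlock, and check that the cluster still points at the block group we locked, undoing and retrying if not. A simplified userspace sketch follows; unlike the kernel code it does not return with refill_lock held, and the kernel additionally holds a refcount on the group across the unlocked window.

#include <pthread.h>
#include <stddef.h>

struct group {
	pthread_rwlock_t rw;		/* sleeping lock, like data_rwsem */
};

struct cluster {
	pthread_spinlock_t refill;	/* spinlock, like refill_lock */
	struct group *group;		/* may be retargeted concurrently */
};

static struct group *lock_cluster_group(struct cluster *c)
{
	struct group *g;

	pthread_spin_lock(&c->refill);
	for (;;) {
		g = c->group;
		if (!g || pthread_rwlock_tryrdlock(&g->rw) == 0)
			break;		/* nothing to lock, or got it */

		/* Cannot block on the rwlock under the spinlock. */
		pthread_spin_unlock(&c->refill);
		pthread_rwlock_rdlock(&g->rw);
		pthread_spin_lock(&c->refill);

		if (g == c->group)	/* still current: keep the lock */
			break;

		pthread_rwlock_unlock(&g->rw);	/* stale: undo and retry */
	}
	pthread_spin_unlock(&c->refill);
	return g;
}
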
@@ -7431,6 +7543,7 @@ checks:
                        btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
                }
+               btrfs_inc_block_group_reservations(block_group);
 
                /* we are all good, lets return */
                ins->objectid = search_start;
@@ -7471,7 +7584,7 @@ loop:
                if (loop == LOOP_CACHING_NOWAIT) {
                        /*
                         * We want to skip the LOOP_CACHING_WAIT step if we
-                        * don't have any unached bgs and we've alrelady done a
+                        * don't have any uncached bgs and we've already done a
                         * full search through.
                         */
                        if (orig_have_caching_bg || !full_search)
@@ -7612,8 +7725,10 @@ again:
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
                               flags, delalloc);
-
-       if (ret == -ENOSPC) {
+       if (!ret && !is_data) {
+               btrfs_dec_block_group_reservations(root->fs_info,
+                                                  ins->objectid);
+       } else if (ret == -ENOSPC) {
                if (!final_tried && ins->offset) {
                        num_bytes = min(num_bytes >> 1, ins->offset);
                        num_bytes = round_down(num_bytes, root->sectorsize);
@@ -7873,7 +7988,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 
        /*
         * Mixed block groups will exclude before processing the log so we only
-        * need to do the exlude dance if this fs isn't mixed.
+        * need to do the exclude dance if this fs isn't mixed.
         */
        if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
                ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
@@ -7901,8 +8016,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        struct extent_buffer *buf;
 
        buf = btrfs_find_create_tree_block(root, bytenr);
-       if (!buf)
-               return ERR_PTR(-ENOMEM);
+       if (IS_ERR(buf))
+               return buf;
+
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
@@ -7923,7 +8039,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                                        buf->start + buf->len - 1, GFP_NOFS);
                else
                        set_extent_new(&root->dirty_log_pages, buf->start,
-                                       buf->start + buf->len - 1, GFP_NOFS);
+                                       buf->start + buf->len - 1);
        } else {
                buf->log_index = -1;
                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
@@ -8544,8 +8660,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        next = btrfs_find_tree_block(root->fs_info, bytenr);
        if (!next) {
                next = btrfs_find_create_tree_block(root, bytenr);
-               if (!next)
-                       return -ENOMEM;
+               if (IS_ERR(next))
+                       return PTR_ERR(next);
+
                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
                                               level - 1);
                reada = 1;
@@ -9058,7 +9175,7 @@ out:
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
-               btrfs_std_error(root->fs_info, err, NULL);
+               btrfs_handle_fs_error(root->fs_info, err, NULL);
        return err;
 }
 
@@ -9317,7 +9434,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
        u64 free_bytes = 0;
        int factor;
 
-       /* It's df, we don't care if it's racey */
+       /* It's df, we don't care if it's racy */
        if (list_empty(&sinfo->ro_bgs))
                return 0;
 
@@ -10526,14 +10643,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
-                                 EXTENT_DIRTY, GFP_NOFS);
+                                 EXTENT_DIRTY);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
-                                 EXTENT_DIRTY, GFP_NOFS);
+                                 EXTENT_DIRTY);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_dec_block_group_ro(root, block_group);