fs/btrfs/extent-tree.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/writeback.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/sort.h>
  23 #include <linux/rcupdate.h>
  24 #include <linux/kthread.h>
  25 #include <linux/slab.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/percpu_counter.h>
  28 #include "hash.h"
  29 #include "tree-log.h"
  30 #include "disk-io.h"
  31 #include "print-tree.h"
  32 #include "volumes.h"
  33 #include "raid56.h"
  34 #include "locking.h"
  35 #include "free-space-cache.h"
  36 #include "free-space-tree.h"
  37 #include "math.h"
  38 #include "sysfs.h"
  39 #include "qgroup.h"
  40
  41 #undef SCRAMBLE_DELAYED_REFS
  42
  43 /*
  44  * control flags for do_chunk_alloc's force field
  45  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  46  * if we really need one.
  47  *
  48  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  49  * if we have very few chunks already allocated.  This is
  50  * used as part of the clustering code to help make sure
  51  * we have a good pool of storage to cluster in, without
  52  * filling the FS with empty chunks
  53  *
  54  * CHUNK_ALLOC_FORCE means it must try to allocate one
  55  *
  56  */
  57 enum {
  58         CHUNK_ALLOC_NO_FORCE = 0,
  59         CHUNK_ALLOC_LIMITED = 1,
  60         CHUNK_ALLOC_FORCE = 2,
  61 };
  62
  63 /*
  64  * Control how reservations are dealt with.
  65  *
  66  * RESERVE_FREE - freeing a reservation.
  67  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
  68  *   ENOSPC accounting
  69  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
  70  *   bytes_may_use as the ENOSPC accounting is done elsewhere
  71  */
  72 enum {
  73         RESERVE_FREE = 0,
  74         RESERVE_ALLOC = 1,
  75         RESERVE_ALLOC_NO_ACCOUNT = 2,
  76 };
  77
  78 static int update_block_group(struct btrfs_trans_handle *trans,
  79                               struct btrfs_root *root, u64 bytenr,
  80                               u64 num_bytes, int alloc);
  81 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  82                                 struct btrfs_root *root,
  83                                 struct btrfs_delayed_ref_node *node, u64 parent,
  84                                 u64 root_objectid, u64 owner_objectid,
  85                                 u64 owner_offset, int refs_to_drop,
  86                                 struct btrfs_delayed_extent_op *extra_op);
  87 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  88                                     struct extent_buffer *leaf,
  89                                     struct btrfs_extent_item *ei);
  90 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  91                                       struct btrfs_root *root,
  92                                       u64 parent, u64 root_objectid,
  93                                       u64 flags, u64 owner, u64 offset,
  94                                       struct btrfs_key *ins, int ref_mod);
  95 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  96                                      struct btrfs_root *root,
  97                                      u64 parent, u64 root_objectid,
  98                                      u64 flags, struct btrfs_disk_key *key,
  99                                      int level, struct btrfs_key *ins);
 100 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 101                           struct btrfs_root *extent_root, u64 flags,
 102                           int force);
 103 static int find_next_key(struct btrfs_path *path, int level,
 104                          struct btrfs_key *key);
 105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 106                             int dump_block_groups);
 107 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
 108                                     u64 num_bytes, int reserve, int delalloc);
 109 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
 110                                      u64 num_bytes, int delalloc);
 111 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 112                                u64 num_bytes);
 113 int btrfs_pin_extent(struct btrfs_root *root,
 114                      u64 bytenr, u64 num_bytes, int reserved);
 115 static int __reserve_metadata_bytes(struct btrfs_root *root,
 116                                     struct btrfs_space_info *space_info,
 117                                     u64 orig_bytes,
 118                                     enum btrfs_reserve_flush_enum flush);
 119 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
 120                                      struct btrfs_space_info *space_info,
 121                                      u64 num_bytes);
 122 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
 123                                      struct btrfs_space_info *space_info,
 124                                      u64 num_bytes);
 125
 126 static noinline int
 127 block_group_cache_done(struct btrfs_block_group_cache *cache)
 128 {
 129         smp_mb();
 130         return cache->cached == BTRFS_CACHE_FINISHED ||
 131                 cache->cached == BTRFS_CACHE_ERROR;
 132 }
 133
 134 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 135 {
 136         return (cache->flags & bits) == bits;
 137 }
 138
 139 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 140 {
 141         atomic_inc(&cache->count);
 142 }
 143
 144 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 145 {
 146         if (atomic_dec_and_test(&cache->count)) {
 147                 WARN_ON(cache->pinned > 0);
 148                 WARN_ON(cache->reserved > 0);
 149                 kfree(cache->free_space_ctl);
 150                 kfree(cache);
 151         }
 152 }
 153
 154 /*
 155  * this adds the block group to the fs_info rb tree for the block group
 156  * cache
 157  */
 158 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 159                                 struct btrfs_block_group_cache *block_group)
 160 {
 161         struct rb_node **p;
 162         struct rb_node *parent = NULL;
 163         struct btrfs_block_group_cache *cache;
 164
 165         spin_lock(&info->block_group_cache_lock);
 166         p = &info->block_group_cache_tree.rb_node;
 167
 168         while (*p) {
 169                 parent = *p;
 170                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 171                                  cache_node);
 172                 if (block_group->key.objectid < cache->key.objectid) {
 173                         p = &(*p)->rb_left;
 174                 } else if (block_group->key.objectid > cache->key.objectid) {
 175                         p = &(*p)->rb_right;
 176                 } else {
 177                         spin_unlock(&info->block_group_cache_lock);
 178                         return -EEXIST;
 179                 }
 180         }
 181
 182         rb_link_node(&block_group->cache_node, parent, p);
 183         rb_insert_color(&block_group->cache_node,
 184                         &info->block_group_cache_tree);
 185
 186         if (info->first_logical_byte > block_group->key.objectid)
 187                 info->first_logical_byte = block_group->key.objectid;
 188
 189         spin_unlock(&info->block_group_cache_lock);
 190
 191         return 0;
 192 }
 193
 194 /*
 195  * This will return the block group at or after bytenr if contains is 0, else
 196  * it will return the block group that contains the bytenr
 197  */
 198 static struct btrfs_block_group_cache *
 199 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 200                               int contains)
 201 {
 202         struct btrfs_block_group_cache *cache, *ret = NULL;
 203         struct rb_node *n;
 204         u64 end, start;
 205
 206         spin_lock(&info->block_group_cache_lock);
 207         n = info->block_group_cache_tree.rb_node;
 208
 209         while (n) {
 210                 cache = rb_entry(n, struct btrfs_block_group_cache,
 211                                  cache_node);
 212                 end = cache->key.objectid + cache->key.offset - 1;
 213                 start = cache->key.objectid;
 214
 215                 if (bytenr < start) {
 216                         if (!contains && (!ret || start < ret->key.objectid))
 217                                 ret = cache;
 218                         n = n->rb_left;
 219                 } else if (bytenr > start) {
 220                         if (contains && bytenr <= end) {
 221                                 ret = cache;
 222                                 break;
 223                         }
 224                         n = n->rb_right;
 225                 } else {
 226                         ret = cache;
 227                         break;
 228                 }
 229         }
 230         if (ret) {
 231                 btrfs_get_block_group(ret);
 232                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 233                         info->first_logical_byte = ret->key.objectid;
 234         }
 235         spin_unlock(&info->block_group_cache_lock);
 236
 237         return ret;
 238 }
 239
 240 static int add_excluded_extent(struct btrfs_root *root,
 241                                u64 start, u64 num_bytes)
 242 {
 243         u64 end = start + num_bytes - 1;
 244         set_extent_bits(&root->fs_info->freed_extents[0],
 245                         start, end, EXTENT_UPTODATE);
 246         set_extent_bits(&root->fs_info->freed_extents[1],
 247                         start, end, EXTENT_UPTODATE);
 248         return 0;
 249 }
 250
 251 static void free_excluded_extents(struct btrfs_root *root,
 252                                   struct btrfs_block_group_cache *cache)
 253 {
 254         u64 start, end;
 255
 256         start = cache->key.objectid;
 257         end = start + cache->key.offset - 1;
 258
 259         clear_extent_bits(&root->fs_info->freed_extents[0],
 260                           start, end, EXTENT_UPTODATE);
 261         clear_extent_bits(&root->fs_info->freed_extents[1],
 262                           start, end, EXTENT_UPTODATE);
 263 }
 264
 265 static int exclude_super_stripes(struct btrfs_root *root,
 266                                  struct btrfs_block_group_cache *cache)
 267 {
 268         u64 bytenr;
 269         u64 *logical;
 270         int stripe_len;
 271         int i, nr, ret;
 272
 273         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 274                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 275                 cache->bytes_super += stripe_len;
 276                 ret = add_excluded_extent(root, cache->key.objectid,
 277                                           stripe_len);
 278                 if (ret)
 279                         return ret;
 280         }
 281
 282         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 283                 bytenr = btrfs_sb_offset(i);
 284                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 285                                        cache->key.objectid, bytenr,
 286                                        0, &logical, &nr, &stripe_len);
 287                 if (ret)
 288                         return ret;
 289
 290                 while (nr--) {
 291                         u64 start, len;
 292
 293                         if (logical[nr] > cache->key.objectid +
 294                             cache->key.offset)
 295                                 continue;
 296
 297                         if (logical[nr] + stripe_len <= cache->key.objectid)
 298                                 continue;
 299
 300                         start = logical[nr];
 301                         if (start < cache->key.objectid) {
 302                                 start = cache->key.objectid;
 303                                 len = (logical[nr] + stripe_len) - start;
 304                         } else {
 305                                 len = min_t(u64, stripe_len,
 306                                             cache->key.objectid +
 307                                             cache->key.offset - start);
 308                         }
 309
 310                         cache->bytes_super += len;
 311                         ret = add_excluded_extent(root, start, len);
 312                         if (ret) {
 313                                 kfree(logical);
 314                                 return ret;
 315                         }
 316                 }
 317
 318                 kfree(logical);
 319         }
 320         return 0;
 321 }
 322
 323 static struct btrfs_caching_control *
 324 get_caching_control(struct btrfs_block_group_cache *cache)
 325 {
 326         struct btrfs_caching_control *ctl;
 327
 328         spin_lock(&cache->lock);
 329         if (!cache->caching_ctl) {
 330                 spin_unlock(&cache->lock);
 331                 return NULL;
 332         }
 333
 334         ctl = cache->caching_ctl;
 335         atomic_inc(&ctl->count);
 336         spin_unlock(&cache->lock);
 337         return ctl;
 338 }
 339
 340 static void put_caching_control(struct btrfs_caching_control *ctl)
 341 {
 342         if (atomic_dec_and_test(&ctl->count))
 343                 kfree(ctl);
 344 }
 345
 346 #ifdef CONFIG_BTRFS_DEBUG
 347 static void fragment_free_space(struct btrfs_root *root,
 348                                 struct btrfs_block_group_cache *block_group)
 349 {
 350         u64 start = block_group->key.objectid;
 351         u64 len = block_group->key.offset;
 352         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 353                 root->nodesize : root->sectorsize;
 354         u64 step = chunk << 1;
 355
 356         while (len > chunk) {
 357                 btrfs_remove_free_space(block_group, start, chunk);
 358                 start += step;
 359                 if (len < step)
 360                         len = 0;
 361                 else
 362                         len -= step;
 363         }
 364 }
 365 #endif
 366
 367 /*
 368  * this is only called by cache_block_group, since we could have freed extents
 369  * we need to check the pinned_extents for any extents that can't be used yet
 370  * since their free space will be released as soon as the transaction commits.
 371  */
 372 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 373                        struct btrfs_fs_info *info, u64 start, u64 end)
 374 {
 375         u64 extent_start, extent_end, size, total_added = 0;
 376         int ret;
 377
 378         while (start < end) {
 379                 ret = find_first_extent_bit(info->pinned_extents, start,
 380                                             &extent_start, &extent_end,
 381                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 382                                             NULL);
 383                 if (ret)
 384                         break;
 385
 386                 if (extent_start <= start) {
 387                         start = extent_end + 1;
 388                 } else if (extent_start > start && extent_start < end) {
 389                         size = extent_start - start;
 390                         total_added += size;
 391                         ret = btrfs_add_free_space(block_group, start,
 392                                                    size);
 393                         BUG_ON(ret); /* -ENOMEM or logic error */
 394                         start = extent_end + 1;
 395                 } else {
 396                         break;
 397                 }
 398         }
 399
 400         if (start < end) {
 401                 size = end - start;
 402                 total_added += size;
 403                 ret = btrfs_add_free_space(block_group, start, size);
 404                 BUG_ON(ret); /* -ENOMEM or logic error */
 405         }
 406
 407         return total_added;
 408 }
 409
 410 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 411 {
 412         struct btrfs_block_group_cache *block_group;
 413         struct btrfs_fs_info *fs_info;
 414         struct btrfs_root *extent_root;
 415         struct btrfs_path *path;
 416         struct extent_buffer *leaf;
 417         struct btrfs_key key;
 418         u64 total_found = 0;
 419         u64 last = 0;
 420         u32 nritems;
 421         int ret;
 422         bool wakeup = true;
 423
 424         block_group = caching_ctl->block_group;
 425         fs_info = block_group->fs_info;
 426         extent_root = fs_info->extent_root;
 427
 428         path = btrfs_alloc_path();
 429         if (!path)
 430                 return -ENOMEM;
 431
 432         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 433
 434 #ifdef CONFIG_BTRFS_DEBUG
 435         /*
 436          * If we're fragmenting we don't want to make anybody think we can
 437          * allocate from this block group until we've had a chance to fragment
 438          * the free space.
 439          */
 440         if (btrfs_should_fragment_free_space(extent_root, block_group))
 441                 wakeup = false;
 442 #endif
 443         /*
 444          * We don't want to deadlock with somebody trying to allocate a new
 445          * extent for the extent root while also trying to search the extent
 446          * root to add free space.  So we skip locking and search the commit
 447          * root, since its read-only
 448          */
 449         path->skip_locking = 1;
 450         path->search_commit_root = 1;
 451         path->reada = READA_FORWARD;
 452
 453         key.objectid = last;
 454         key.offset = 0;
 455         key.type = BTRFS_EXTENT_ITEM_KEY;
 456
 457 next:
 458         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 459         if (ret < 0)
 460                 goto out;
 461
 462         leaf = path->nodes[0];
 463         nritems = btrfs_header_nritems(leaf);
 464
 465         while (1) {
 466                 if (btrfs_fs_closing(fs_info) > 1) {
 467                         last = (u64)-1;
 468                         break;
 469                 }
 470
 471                 if (path->slots[0] < nritems) {
 472                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 473                 } else {
 474                         ret = find_next_key(path, 0, &key);
 475                         if (ret)
 476                                 break;
 477
 478                         if (need_resched() ||
 479                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 480                                 if (wakeup)
 481                                         caching_ctl->progress = last;
 482                                 btrfs_release_path(path);
 483                                 up_read(&fs_info->commit_root_sem);
 484                                 mutex_unlock(&caching_ctl->mutex);
 485                                 cond_resched();
 486                                 mutex_lock(&caching_ctl->mutex);
 487                                 down_read(&fs_info->commit_root_sem);
 488                                 goto next;
 489                         }
 490
 491                         ret = btrfs_next_leaf(extent_root, path);
 492                         if (ret < 0)
 493                                 goto out;
 494                         if (ret)
 495                                 break;
 496                         leaf = path->nodes[0];
 497                         nritems = btrfs_header_nritems(leaf);
 498                         continue;
 499                 }
 500
 501                 if (key.objectid < last) {
 502                         key.objectid = last;
 503                         key.offset = 0;
 504                         key.type = BTRFS_EXTENT_ITEM_KEY;
 505
 506                         if (wakeup)
 507                                 caching_ctl->progress = last;
 508                         btrfs_release_path(path);
 509                         goto next;
 510                 }
 511
 512                 if (key.objectid < block_group->key.objectid) {
 513                         path->slots[0]++;
 514                         continue;
 515                 }
 516
 517                 if (key.objectid >= block_group->key.objectid +
 518                     block_group->key.offset)
 519                         break;
 520
 521                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 522                     key.type == BTRFS_METADATA_ITEM_KEY) {
 523                         total_found += add_new_free_space(block_group,
 524                                                           fs_info, last,
 525                                                           key.objectid);
 526                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 527                                 last = key.objectid +
 528                                         fs_info->tree_root->nodesize;
 529                         else
 530                                 last = key.objectid + key.offset;
 531
 532                         if (total_found > CACHING_CTL_WAKE_UP) {
 533                                 total_found = 0;
 534                                 if (wakeup)
 535                                         wake_up(&caching_ctl->wait);
 536                         }
 537                 }
 538                 path->slots[0]++;
 539         }
 540         ret = 0;
 541
 542         total_found += add_new_free_space(block_group, fs_info, last,
 543                                           block_group->key.objectid +
 544                                           block_group->key.offset);
 545         caching_ctl->progress = (u64)-1;
 546
 547 out:
 548         btrfs_free_path(path);
 549         return ret;
 550 }
 551
 552 static noinline void caching_thread(struct btrfs_work *work)
 553 {
 554         struct btrfs_block_group_cache *block_group;
 555         struct btrfs_fs_info *fs_info;
 556         struct btrfs_caching_control *caching_ctl;
 557         struct btrfs_root *extent_root;
 558         int ret;
 559
 560         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 561         block_group = caching_ctl->block_group;
 562         fs_info = block_group->fs_info;
 563         extent_root = fs_info->extent_root;
 564
 565         mutex_lock(&caching_ctl->mutex);
 566         down_read(&fs_info->commit_root_sem);
 567
 568         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 569                 ret = load_free_space_tree(caching_ctl);
 570         else
 571                 ret = load_extent_tree_free(caching_ctl);
 572
 573         spin_lock(&block_group->lock);
 574         block_group->caching_ctl = NULL;
 575         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 576         spin_unlock(&block_group->lock);
 577
 578 #ifdef CONFIG_BTRFS_DEBUG
 579         if (btrfs_should_fragment_free_space(extent_root, block_group)) {
 580                 u64 bytes_used;
 581
 582                 spin_lock(&block_group->space_info->lock);
 583                 spin_lock(&block_group->lock);
 584                 bytes_used = block_group->key.offset -
 585                         btrfs_block_group_used(&block_group->item);
 586                 block_group->space_info->bytes_used += bytes_used >> 1;
 587                 spin_unlock(&block_group->lock);
 588                 spin_unlock(&block_group->space_info->lock);
 589                 fragment_free_space(extent_root, block_group);
 590         }
 591 #endif
 592
 593         caching_ctl->progress = (u64)-1;
 594
 595         up_read(&fs_info->commit_root_sem);
 596         free_excluded_extents(fs_info->extent_root, block_group);
 597         mutex_unlock(&caching_ctl->mutex);
 598
 599         wake_up(&caching_ctl->wait);
 600
 601         put_caching_control(caching_ctl);
 602         btrfs_put_block_group(block_group);
 603 }
 604
 605 static int cache_block_group(struct btrfs_block_group_cache *cache,
 606                              int load_cache_only)
 607 {
 608         DEFINE_WAIT(wait);
 609         struct btrfs_fs_info *fs_info = cache->fs_info;
 610         struct btrfs_caching_control *caching_ctl;
 611         int ret = 0;
 612
 613         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 614         if (!caching_ctl)
 615                 return -ENOMEM;
 616
 617         INIT_LIST_HEAD(&caching_ctl->list);
 618         mutex_init(&caching_ctl->mutex);
 619         init_waitqueue_head(&caching_ctl->wait);
 620         caching_ctl->block_group = cache;
 621         caching_ctl->progress = cache->key.objectid;
 622         atomic_set(&caching_ctl->count, 1);
 623         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 624                         caching_thread, NULL, NULL);
 625
 626         spin_lock(&cache->lock);
 627         /*
 628          * This should be a rare occasion, but this could happen I think in the
 629          * case where one thread starts to load the space cache info, and then
 630          * some other thread starts a transaction commit which tries to do an
 631          * allocation while the other thread is still loading the space cache
 632          * info.  The previous loop should have kept us from choosing this block
 633          * group, but if we've moved to the state where we will wait on caching
 634          * block groups we need to first check if we're doing a fast load here,
 635          * so we can wait for it to finish, otherwise we could end up allocating
 636          * from a block group who's cache gets evicted for one reason or
 637          * another.
 638          */
 639         while (cache->cached == BTRFS_CACHE_FAST) {
 640                 struct btrfs_caching_control *ctl;
 641
 642                 ctl = cache->caching_ctl;
 643                 atomic_inc(&ctl->count);
 644                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 645                 spin_unlock(&cache->lock);
 646
 647                 schedule();
 648
 649                 finish_wait(&ctl->wait, &wait);
 650                 put_caching_control(ctl);
 651                 spin_lock(&cache->lock);
 652         }
 653
 654         if (cache->cached != BTRFS_CACHE_NO) {
 655                 spin_unlock(&cache->lock);
 656                 kfree(caching_ctl);
 657                 return 0;
 658         }
 659         WARN_ON(cache->caching_ctl);
 660         cache->caching_ctl = caching_ctl;
 661         cache->cached = BTRFS_CACHE_FAST;
 662         spin_unlock(&cache->lock);
 663
 664         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 665                 mutex_lock(&caching_ctl->mutex);
 666                 ret = load_free_space_cache(fs_info, cache);
 667
 668                 spin_lock(&cache->lock);
 669                 if (ret == 1) {
 670                         cache->caching_ctl = NULL;
 671                         cache->cached = BTRFS_CACHE_FINISHED;
 672                         cache->last_byte_to_unpin = (u64)-1;
 673                         caching_ctl->progress = (u64)-1;
 674                 } else {
 675                         if (load_cache_only) {
 676                                 cache->caching_ctl = NULL;
 677                                 cache->cached = BTRFS_CACHE_NO;
 678                         } else {
 679                                 cache->cached = BTRFS_CACHE_STARTED;
 680                                 cache->has_caching_ctl = 1;
 681                         }
 682                 }
 683                 spin_unlock(&cache->lock);
 684 #ifdef CONFIG_BTRFS_DEBUG
 685                 if (ret == 1 &&
 686                     btrfs_should_fragment_free_space(fs_info->extent_root,
 687                                                      cache)) {
 688                         u64 bytes_used;
 689
 690                         spin_lock(&cache->space_info->lock);
 691                         spin_lock(&cache->lock);
 692                         bytes_used = cache->key.offset -
 693                                 btrfs_block_group_used(&cache->item);
 694                         cache->space_info->bytes_used += bytes_used >> 1;
 695                         spin_unlock(&cache->lock);
 696                         spin_unlock(&cache->space_info->lock);
 697                         fragment_free_space(fs_info->extent_root, cache);
 698                 }
 699 #endif
 700                 mutex_unlock(&caching_ctl->mutex);
 701
 702                 wake_up(&caching_ctl->wait);
 703                 if (ret == 1) {
 704                         put_caching_control(caching_ctl);
 705                         free_excluded_extents(fs_info->extent_root, cache);
 706                         return 0;
 707                 }
 708         } else {
 709                 /*
 710                  * We're either using the free space tree or no caching at all.
 711                  * Set cached to the appropriate value and wakeup any waiters.
 712                  */
 713                 spin_lock(&cache->lock);
 714                 if (load_cache_only) {
 715                         cache->caching_ctl = NULL;
 716                         cache->cached = BTRFS_CACHE_NO;
 717                 } else {
 718                         cache->cached = BTRFS_CACHE_STARTED;
 719                         cache->has_caching_ctl = 1;
 720                 }
 721                 spin_unlock(&cache->lock);
 722                 wake_up(&caching_ctl->wait);
 723         }
 724
 725         if (load_cache_only) {
 726                 put_caching_control(caching_ctl);
 727                 return 0;
 728         }
 729
 730         down_write(&fs_info->commit_root_sem);
 731         atomic_inc(&caching_ctl->count);
 732         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 733         up_write(&fs_info->commit_root_sem);
 734
 735         btrfs_get_block_group(cache);
 736
 737         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 738
 739         return ret;
 740 }
 741
 742 /*
 743  * return the block group that starts at or after bytenr
 744  */
 745 static struct btrfs_block_group_cache *
 746 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 747 {
 748         struct btrfs_block_group_cache *cache;
 749
 750         cache = block_group_cache_tree_search(info, bytenr, 0);
 751
 752         return cache;
 753 }
 754
 755 /*
 756  * return the block group that contains the given bytenr
 757  */
 758 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 759                                                  struct btrfs_fs_info *info,
 760                                                  u64 bytenr)
 761 {
 762         struct btrfs_block_group_cache *cache;
 763
 764         cache = block_group_cache_tree_search(info, bytenr, 1);
 765
 766         return cache;
 767 }
 768
 769 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 770                                                   u64 flags)
 771 {
 772         struct list_head *head = &info->space_info;
 773         struct btrfs_space_info *found;
 774
 775         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 776
 777         rcu_read_lock();
 778         list_for_each_entry_rcu(found, head, list) {
 779                 if (found->flags & flags) {
 780                         rcu_read_unlock();
 781                         return found;
 782                 }
 783         }
 784         rcu_read_unlock();
 785         return NULL;
 786 }
 787
 788 /*
 789  * after adding space to the filesystem, we need to clear the full flags
 790  * on all the space infos.
 791  */
 792 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 793 {
 794         struct list_head *head = &info->space_info;
 795         struct btrfs_space_info *found;
 796
 797         rcu_read_lock();
 798         list_for_each_entry_rcu(found, head, list)
 799                 found->full = 0;
 800         rcu_read_unlock();
 801 }
 802
 803 /* simple helper to search for an existing data extent at a given offset */
 804 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 805 {
 806         int ret;
 807         struct btrfs_key key;
 808         struct btrfs_path *path;
 809
 810         path = btrfs_alloc_path();
 811         if (!path)
 812                 return -ENOMEM;
 813
 814         key.objectid = start;
 815         key.offset = len;
 816         key.type = BTRFS_EXTENT_ITEM_KEY;
 817         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 818                                 0, 0);
 819         btrfs_free_path(path);
 820         return ret;
 821 }
 822
 823 /*
 824  * helper function to lookup reference count and flags of a tree block.
 825  *
 826  * the head node for delayed ref is used to store the sum of all the
 827  * reference count modifications queued up in the rbtree. the head
 828  * node may also store the extent flags to set. This way you can check
 829  * to see what the reference count and extent flags would be if all of
 830  * the delayed refs are not processed.
 831  */
 832 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 833                              struct btrfs_root *root, u64 bytenr,
 834                              u64 offset, int metadata, u64 *refs, u64 *flags)
 835 {
 836         struct btrfs_delayed_ref_head *head;
 837         struct btrfs_delayed_ref_root *delayed_refs;
 838         struct btrfs_path *path;
 839         struct btrfs_extent_item *ei;
 840         struct extent_buffer *leaf;
 841         struct btrfs_key key;
 842         u32 item_size;
 843         u64 num_refs;
 844         u64 extent_flags;
 845         int ret;
 846
 847         /*
 848          * If we don't have skinny metadata, don't bother doing anything
 849          * different
 850          */
 851         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
 852                 offset = root->nodesize;
 853                 metadata = 0;
 854         }
 855
 856         path = btrfs_alloc_path();
 857         if (!path)
 858                 return -ENOMEM;
 859
 860         if (!trans) {
 861                 path->skip_locking = 1;
 862                 path->search_commit_root = 1;
 863         }
 864
 865 search_again:
 866         key.objectid = bytenr;
 867         key.offset = offset;
 868         if (metadata)
 869                 key.type = BTRFS_METADATA_ITEM_KEY;
 870         else
 871                 key.type = BTRFS_EXTENT_ITEM_KEY;
 872
 873         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 874                                 &key, path, 0, 0);
 875         if (ret < 0)
 876                 goto out_free;
 877
 878         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 879                 if (path->slots[0]) {
 880                         path->slots[0]--;
 881                         btrfs_item_key_to_cpu(path->nodes[0], &key,
 882                                               path->slots[0]);
 883                         if (key.objectid == bytenr &&
 884                             key.type == BTRFS_EXTENT_ITEM_KEY &&
 885                             key.offset == root->nodesize)
 886                                 ret = 0;
 887                 }
 888         }
 889
 890         if (ret == 0) {
 891                 leaf = path->nodes[0];
 892                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 893                 if (item_size >= sizeof(*ei)) {
 894                         ei = btrfs_item_ptr(leaf, path->slots[0],
 895                                             struct btrfs_extent_item);
 896                         num_refs = btrfs_extent_refs(leaf, ei);
 897                         extent_flags = btrfs_extent_flags(leaf, ei);
 898                 } else {
 899 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 900                         struct btrfs_extent_item_v0 *ei0;
 901                         BUG_ON(item_size != sizeof(*ei0));
 902                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
 903                                              struct btrfs_extent_item_v0);
 904                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
 905                         /* FIXME: this isn't correct for data */
 906                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 907 #else
 908                         BUG();
 909 #endif
 910                 }
 911                 BUG_ON(num_refs == 0);
 912         } else {
 913                 num_refs = 0;
 914                 extent_flags = 0;
 915                 ret = 0;
 916         }
 917
 918         if (!trans)
 919                 goto out;
 920
 921         delayed_refs = &trans->transaction->delayed_refs;
 922         spin_lock(&delayed_refs->lock);
 923         head = btrfs_find_delayed_ref_head(trans, bytenr);
 924         if (head) {
 925                 if (!mutex_trylock(&head->mutex)) {
 926                         atomic_inc(&head->node.refs);
 927                         spin_unlock(&delayed_refs->lock);
 928
 929                         btrfs_release_path(path);
 930
 931                         /*
 932                          * Mutex was contended, block until it's released and try
 933                          * again
 934                          */
 935                         mutex_lock(&head->mutex);
 936                         mutex_unlock(&head->mutex);
 937                         btrfs_put_delayed_ref(&head->node);
 938                         goto search_again;
 939                 }
 940                 spin_lock(&head->lock);
 941                 if (head->extent_op && head->extent_op->update_flags)
 942                         extent_flags |= head->extent_op->flags_to_set;
 943                 else
 944                         BUG_ON(num_refs == 0);
 945
 946                 num_refs += head->node.ref_mod;
 947                 spin_unlock(&head->lock);
 948                 mutex_unlock(&head->mutex);
 949         }
 950         spin_unlock(&delayed_refs->lock);
 951 out:
 952         WARN_ON(num_refs == 0);
 953         if (refs)
 954                 *refs = num_refs;
 955         if (flags)
 956                 *flags = extent_flags;
 957 out_free:
 958         btrfs_free_path(path);
 959         return ret;
 960 }
 961
 962 /*
 963  * Back reference rules.  Back refs have three main goals:
 964  *
 965  * 1) differentiate between all holders of references to an extent so that
 966  *    when a reference is dropped we can make sure it was a valid reference
 967  *    before freeing the extent.
 968  *
 969  * 2) Provide enough information to quickly find the holders of an extent
 970  *    if we notice a given block is corrupted or bad.
 971  *
 972  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 973  *    maintenance.  This is actually the same as #2, but with a slightly
 974  *    different use case.
 975  *
 976  * There are two kinds of back refs. The implicit back refs is optimized
 977  * for pointers in non-shared tree blocks. For a given pointer in a block,
 978  * back refs of this kind provide information about the block's owner tree
 979  * and the pointer's key. These information allow us to find the block by
 980  * b-tree searching. The full back refs is for pointers in tree blocks not
 981  * referenced by their owner trees. The location of tree block is recorded
 982  * in the back refs. Actually the full back refs is generic, and can be
 983  * used in all cases the implicit back refs is used. The major shortcoming
 984  * of the full back refs is its overhead. Every time a tree block gets
 985  * COWed, we have to update back refs entry for all pointers in it.
 986  *
 987  * For a newly allocated tree block, we use implicit back refs for
 988  * pointers in it. This means most tree related operations only involve
 989  * implicit back refs. For a tree block created in old transaction, the
 990  * only way to drop a reference to it is COW it. So we can detect the
 991  * event that tree block loses its owner tree's reference and do the
 992  * back refs conversion.
 993  *
 994  * When a tree block is COWed through a tree, there are four cases:
 995  *
 996  * The reference count of the block is one and the tree is the block's
 997  * owner tree. Nothing to do in this case.
 998  *
 999  * The reference count of the block is one and the tree is not the
1000  * block's owner tree. In this case, full back refs is used for pointers
1001  * in the block. Remove these full back refs, add implicit back refs for
1002  * every pointers in the new block.
1003  *
1004  * The reference count of the block is greater than one and the tree is
1005  * the block's owner tree. In this case, implicit back refs is used for
1006  * pointers in the block. Add full back refs for every pointers in the
1007  * block, increase lower level extents' reference counts. The original
1008  * implicit back refs are entailed to the new block.
1009  *
1010  * The reference count of the block is greater than one and the tree is
1011  * not the block's owner tree. Add implicit back refs for every pointer in
1012  * the new block, increase lower level extents' reference count.
1013  *
1014  * Back Reference Key composing:
1015  *
1016  * The key objectid corresponds to the first byte in the extent,
1017  * The key type is used to differentiate between types of back refs.
1018  * There are different meanings of the key offset for different types
1019  * of back refs.
1020  *
1021  * File extents can be referenced by:
1022  *
1023  * - multiple snapshots, subvolumes, or different generations in one subvol
1024  * - different files inside a single subvolume
1025  * - different offsets inside a file (bookend extents in file.c)
1026  *
1027  * The extent ref structure for the implicit back refs has fields for:
1028  *
1029  * - Objectid of the subvolume root
1030  * - objectid of the file holding the reference
1031  * - original offset in the file
1032  * - how many bookend extents
1033  *
1034  * The key offset for the implicit back refs is hash of the first
1035  * three fields.
1036  *
1037  * The extent ref structure for the full back refs has field for:
1038  *
1039  * - number of pointers in the tree leaf
1040  *
1041  * The key offset for the implicit back refs is the first byte of
1042  * the tree leaf
1043  *
1044  * When a file extent is allocated, The implicit back refs is used.
1045  * the fields are filled in:
1046  *
1047  *     (root_key.objectid, inode objectid, offset in file, 1)
1048  *
1049  * When a file extent is removed file truncation, we find the
1050  * corresponding implicit back refs and check the following fields:
1051  *
1052  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1053  *
1054  * Btree extents can be referenced by:
1055  *
1056  * - Different subvolumes
1057  *
1058  * Both the implicit back refs and the full back refs for tree blocks
1059  * only consist of key. The key offset for the implicit back refs is
1060  * objectid of block's owner tree. The key offset for the full back refs
1061  * is the first byte of parent block.
1062  *
1063  * When implicit back refs is used, information about the lowest key and
1064  * level of the tree block are required. These information are stored in
1065  * tree block info structure.
1066  */
1067
1068 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1069 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1070                                   struct btrfs_root *root,
1071                                   struct btrfs_path *path,
1072                                   u64 owner, u32 extra_size)
1073 {
1074         struct btrfs_extent_item *item;
1075         struct btrfs_extent_item_v0 *ei0;
1076         struct btrfs_extent_ref_v0 *ref0;
1077         struct btrfs_tree_block_info *bi;
1078         struct extent_buffer *leaf;
1079         struct btrfs_key key;
1080         struct btrfs_key found_key;
1081         u32 new_size = sizeof(*item);
1082         u64 refs;
1083         int ret;
1084
1085         leaf = path->nodes[0];
1086         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1087
1088         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1089         ei0 = btrfs_item_ptr(leaf, path->slots[0],
1090                              struct btrfs_extent_item_v0);
1091         refs = btrfs_extent_refs_v0(leaf, ei0);
1092
1093         if (owner == (u64)-1) {
1094                 while (1) {
1095                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1096                                 ret = btrfs_next_leaf(root, path);
1097                                 if (ret < 0)
1098                                         return ret;
1099                                 BUG_ON(ret > 0); /* Corruption */
1100                                 leaf = path->nodes[0];
1101                         }
1102                         btrfs_item_key_to_cpu(leaf, &found_key,
1103                                               path->slots[0]);
1104                         BUG_ON(key.objectid != found_key.objectid);
1105                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1106                                 path->slots[0]++;
1107                                 continue;
1108                         }
1109                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1110                                               struct btrfs_extent_ref_v0);
1111                         owner = btrfs_ref_objectid_v0(leaf, ref0);
1112                         break;
1113                 }
1114         }
1115         btrfs_release_path(path);
1116
1117         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1118                 new_size += sizeof(*bi);
1119
1120         new_size -= sizeof(*ei0);
1121         ret = btrfs_search_slot(trans, root, &key, path,
1122                                 new_size + extra_size, 1);
1123         if (ret < 0)
1124                 return ret;
1125         BUG_ON(ret); /* Corruption */
1126
1127         btrfs_extend_item(root, path, new_size);
1128
1129         leaf = path->nodes[0];
1130         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1131         btrfs_set_extent_refs(leaf, item, refs);
1132         /* FIXME: get real generation */
1133         btrfs_set_extent_generation(leaf, item, 0);
1134         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1135                 btrfs_set_extent_flags(leaf, item,
1136                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1137                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1138                 bi = (struct btrfs_tree_block_info *)(item + 1);
1139                 /* FIXME: get first key of the block */
1140                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1141                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1142         } else {
1143                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1144         }
1145         btrfs_mark_buffer_dirty(leaf);
1146         return 0;
1147 }
1148 #endif
1149
1150 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1151 {
1152         u32 high_crc = ~(u32)0;
1153         u32 low_crc = ~(u32)0;
1154         __le64 lenum;
1155
1156         lenum = cpu_to_le64(root_objectid);
1157         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1158         lenum = cpu_to_le64(owner);
1159         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1160         lenum = cpu_to_le64(offset);
1161         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1162
1163         return ((u64)high_crc << 31) ^ (u64)low_crc;
1164 }
1165
1166 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1167                                      struct btrfs_extent_data_ref *ref)
1168 {
1169         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1170                                     btrfs_extent_data_ref_objectid(leaf, ref),
1171                                     btrfs_extent_data_ref_offset(leaf, ref));
1172 }
1173
1174 static int match_extent_data_ref(struct extent_buffer *leaf,
1175                                  struct btrfs_extent_data_ref *ref,
1176                                  u64 root_objectid, u64 owner, u64 offset)
1177 {
1178         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1179             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1180             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1181                 return 0;
1182         return 1;
1183 }
1184
1185 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1186                                            struct btrfs_root *root,
1187                                            struct btrfs_path *path,
1188                                            u64 bytenr, u64 parent,
1189                                            u64 root_objectid,
1190                                            u64 owner, u64 offset)
1191 {
1192         struct btrfs_key key;
1193         struct btrfs_extent_data_ref *ref;
1194         struct extent_buffer *leaf;
1195         u32 nritems;
1196         int ret;
1197         int recow;
1198         int err = -ENOENT;
1199
1200         key.objectid = bytenr;
1201         if (parent) {
1202                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1203                 key.offset = parent;
1204         } else {
1205                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1206                 key.offset = hash_extent_data_ref(root_objectid,
1207                                                   owner, offset);
1208         }
1209 again:
1210         recow = 0;
1211         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1212         if (ret < 0) {
1213                 err = ret;
1214                 goto fail;
1215         }
1216
1217         if (parent) {
1218                 if (!ret)
1219                         return 0;
1220 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1221                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1222                 btrfs_release_path(path);
1223                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1224                 if (ret < 0) {
1225                         err = ret;
1226                         goto fail;
1227                 }
1228                 if (!ret)
1229                         return 0;
1230 #endif
1231                 goto fail;
1232         }
1233
1234         leaf = path->nodes[0];
1235         nritems = btrfs_header_nritems(leaf);
1236         while (1) {
1237                 if (path->slots[0] >= nritems) {
1238                         ret = btrfs_next_leaf(root, path);
1239                         if (ret < 0)
1240                                 err = ret;
1241                         if (ret)
1242                                 goto fail;
1243
1244                         leaf = path->nodes[0];
1245                         nritems = btrfs_header_nritems(leaf);
1246                         recow = 1;
1247                 }
1248
1249                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1250                 if (key.objectid != bytenr ||
1251                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1252                         goto fail;
1253
1254                 ref = btrfs_item_ptr(leaf, path->slots[0],
1255                                      struct btrfs_extent_data_ref);
1256
1257                 if (match_extent_data_ref(leaf, ref, root_objectid,
1258                                           owner, offset)) {
1259                         if (recow) {
1260                                 btrfs_release_path(path);
1261                                 goto again;
1262                         }
1263                         err = 0;
1264                         break;
1265                 }
1266                 path->slots[0]++;
1267         }
1268 fail:
1269         return err;
1270 }
1271
1272 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1273                                            struct btrfs_root *root,
1274                                            struct btrfs_path *path,
1275                                            u64 bytenr, u64 parent,
1276                                            u64 root_objectid, u64 owner,
1277                                            u64 offset, int refs_to_add)
1278 {
1279         struct btrfs_key key;
1280         struct extent_buffer *leaf;
1281         u32 size;
1282         u32 num_refs;
1283         int ret;
1284
1285         key.objectid = bytenr;
1286         if (parent) {
1287                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1288                 key.offset = parent;
1289                 size = sizeof(struct btrfs_shared_data_ref);
1290         } else {
1291                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1292                 key.offset = hash_extent_data_ref(root_objectid,
1293                                                   owner, offset);
1294                 size = sizeof(struct btrfs_extent_data_ref);
1295         }
1296
1297         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1298         if (ret && ret != -EEXIST)
1299                 goto fail;
1300
1301         leaf = path->nodes[0];
1302         if (parent) {
1303                 struct btrfs_shared_data_ref *ref;
1304                 ref = btrfs_item_ptr(leaf, path->slots[0],
1305                                      struct btrfs_shared_data_ref);
1306                 if (ret == 0) {
1307                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1308                 } else {
1309                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1310                         num_refs += refs_to_add;
1311                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1312                 }
1313         } else {
1314                 struct btrfs_extent_data_ref *ref;
1315                 while (ret == -EEXIST) {
1316                         ref = btrfs_item_ptr(leaf, path->slots[0],
1317                                              struct btrfs_extent_data_ref);
1318                         if (match_extent_data_ref(leaf, ref, root_objectid,
1319                                                   owner, offset))
1320                                 break;
1321                         btrfs_release_path(path);
1322                         key.offset++;
1323                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1324                                                       size);
1325                         if (ret && ret != -EEXIST)
1326                                 goto fail;
1327
1328                         leaf = path->nodes[0];
1329                 }
1330                 ref = btrfs_item_ptr(leaf, path->slots[0],
1331                                      struct btrfs_extent_data_ref);
1332                 if (ret == 0) {
1333                         btrfs_set_extent_data_ref_root(leaf, ref,
1334                                                        root_objectid);
1335                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1336                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1337                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1338                 } else {
1339                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1340                         num_refs += refs_to_add;
1341                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1342                 }
1343         }
1344         btrfs_mark_buffer_dirty(leaf);
1345         ret = 0;
1346 fail:
1347         btrfs_release_path(path);
1348         return ret;
1349 }
1350
1351 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1352                                            struct btrfs_root *root,
1353                                            struct btrfs_path *path,
1354                                            int refs_to_drop, int *last_ref)
1355 {
1356         struct btrfs_key key;
1357         struct btrfs_extent_data_ref *ref1 = NULL;
1358         struct btrfs_shared_data_ref *ref2 = NULL;
1359         struct extent_buffer *leaf;
1360         u32 num_refs = 0;
1361         int ret = 0;
1362
1363         leaf = path->nodes[0];
1364         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1365
1366         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1367                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1368                                       struct btrfs_extent_data_ref);
1369                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1370         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1371                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1372                                       struct btrfs_shared_data_ref);
1373                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1374 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1375         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1376                 struct btrfs_extent_ref_v0 *ref0;
1377                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1378                                       struct btrfs_extent_ref_v0);
1379                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1380 #endif
1381         } else {
1382                 BUG();
1383         }
1384
1385         BUG_ON(num_refs < refs_to_drop);
1386         num_refs -= refs_to_drop;
1387
1388         if (num_refs == 0) {
1389                 ret = btrfs_del_item(trans, root, path);
1390                 *last_ref = 1;
1391         } else {
1392                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1393                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1394                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1395                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1396 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1397                 else {
1398                         struct btrfs_extent_ref_v0 *ref0;
1399                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1400                                         struct btrfs_extent_ref_v0);
1401                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1402                 }
1403 #endif
1404                 btrfs_mark_buffer_dirty(leaf);
1405         }
1406         return ret;
1407 }
1408
1409 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1410                                           struct btrfs_extent_inline_ref *iref)
1411 {
1412         struct btrfs_key key;
1413         struct extent_buffer *leaf;
1414         struct btrfs_extent_data_ref *ref1;
1415         struct btrfs_shared_data_ref *ref2;
1416         u32 num_refs = 0;
1417
1418         leaf = path->nodes[0];
1419         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1420         if (iref) {
1421                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1422                     BTRFS_EXTENT_DATA_REF_KEY) {
1423                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1424                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1425                 } else {
1426                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1427                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1428                 }
1429         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1430                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1431                                       struct btrfs_extent_data_ref);
1432                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1433         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1434                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1435                                       struct btrfs_shared_data_ref);
1436                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1437 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1438         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1439                 struct btrfs_extent_ref_v0 *ref0;
1440                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1441                                       struct btrfs_extent_ref_v0);
1442                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1443 #endif
1444         } else {
1445                 WARN_ON(1);
1446         }
1447         return num_refs;
1448 }
1449
1450 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1451                                           struct btrfs_root *root,
1452                                           struct btrfs_path *path,
1453                                           u64 bytenr, u64 parent,
1454                                           u64 root_objectid)
1455 {
1456         struct btrfs_key key;
1457         int ret;
1458
1459         key.objectid = bytenr;
1460         if (parent) {
1461                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1462                 key.offset = parent;
1463         } else {
1464                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1465                 key.offset = root_objectid;
1466         }
1467
1468         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1469         if (ret > 0)
1470                 ret = -ENOENT;
1471 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1472         if (ret == -ENOENT && parent) {
1473                 btrfs_release_path(path);
1474                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1475                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1476                 if (ret > 0)
1477                         ret = -ENOENT;
1478         }
1479 #endif
1480         return ret;
1481 }
1482
1483 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1484                                           struct btrfs_root *root,
1485                                           struct btrfs_path *path,
1486                                           u64 bytenr, u64 parent,
1487                                           u64 root_objectid)
1488 {
1489         struct btrfs_key key;
1490         int ret;
1491
1492         key.objectid = bytenr;
1493         if (parent) {
1494                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1495                 key.offset = parent;
1496         } else {
1497                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1498                 key.offset = root_objectid;
1499         }
1500
1501         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1502         btrfs_release_path(path);
1503         return ret;
1504 }
1505
1506 static inline int extent_ref_type(u64 parent, u64 owner)
1507 {
1508         int type;
1509         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1510                 if (parent > 0)
1511                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1512                 else
1513                         type = BTRFS_TREE_BLOCK_REF_KEY;
1514         } else {
1515                 if (parent > 0)
1516                         type = BTRFS_SHARED_DATA_REF_KEY;
1517                 else
1518                         type = BTRFS_EXTENT_DATA_REF_KEY;
1519         }
1520         return type;
1521 }
1522
1523 static int find_next_key(struct btrfs_path *path, int level,
1524                          struct btrfs_key *key)
1525
1526 {
1527         for (; level < BTRFS_MAX_LEVEL; level++) {
1528                 if (!path->nodes[level])
1529                         break;
1530                 if (path->slots[level] + 1 >=
1531                     btrfs_header_nritems(path->nodes[level]))
1532                         continue;
1533                 if (level == 0)
1534                         btrfs_item_key_to_cpu(path->nodes[level], key,
1535                                               path->slots[level] + 1);
1536                 else
1537                         btrfs_node_key_to_cpu(path->nodes[level], key,
1538                                               path->slots[level] + 1);
1539                 return 0;
1540         }
1541         return 1;
1542 }
1543
1544 /*
1545  * look for inline back ref. if back ref is found, *ref_ret is set
1546  * to the address of inline back ref, and 0 is returned.
1547  *
1548  * if back ref isn't found, *ref_ret is set to the address where it
1549  * should be inserted, and -ENOENT is returned.
1550  *
1551  * if insert is true and there are too many inline back refs, the path
1552  * points to the extent item, and -EAGAIN is returned.
1553  *
1554  * NOTE: inline back refs are ordered in the same way that back ref
1555  *       items in the tree are ordered.
1556  */
1557 static noinline_for_stack
1558 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1559                                  struct btrfs_root *root,
1560                                  struct btrfs_path *path,
1561                                  struct btrfs_extent_inline_ref **ref_ret,
1562                                  u64 bytenr, u64 num_bytes,
1563                                  u64 parent, u64 root_objectid,
1564                                  u64 owner, u64 offset, int insert)
1565 {
1566         struct btrfs_key key;
1567         struct extent_buffer *leaf;
1568         struct btrfs_extent_item *ei;
1569         struct btrfs_extent_inline_ref *iref;
1570         u64 flags;
1571         u64 item_size;
1572         unsigned long ptr;
1573         unsigned long end;
1574         int extra_size;
1575         int type;
1576         int want;
1577         int ret;
1578         int err = 0;
1579         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1580                                                  SKINNY_METADATA);
1581
1582         key.objectid = bytenr;
1583         key.type = BTRFS_EXTENT_ITEM_KEY;
1584         key.offset = num_bytes;
1585
1586         want = extent_ref_type(parent, owner);
1587         if (insert) {
1588                 extra_size = btrfs_extent_inline_ref_size(want);
1589                 path->keep_locks = 1;
1590         } else
1591                 extra_size = -1;
1592
1593         /*
1594          * Owner is our parent level, so we can just add one to get the level
1595          * for the block we are interested in.
1596          */
1597         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1598                 key.type = BTRFS_METADATA_ITEM_KEY;
1599                 key.offset = owner;
1600         }
1601
1602 again:
1603         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1604         if (ret < 0) {
1605                 err = ret;
1606                 goto out;
1607         }
1608
1609         /*
1610          * We may be a newly converted file system which still has the old fat
1611          * extent entries for metadata, so try and see if we have one of those.
1612          */
1613         if (ret > 0 && skinny_metadata) {
1614                 skinny_metadata = false;
1615                 if (path->slots[0]) {
1616                         path->slots[0]--;
1617                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1618                                               path->slots[0]);
1619                         if (key.objectid == bytenr &&
1620                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1621                             key.offset == num_bytes)
1622                                 ret = 0;
1623                 }
1624                 if (ret) {
1625                         key.objectid = bytenr;
1626                         key.type = BTRFS_EXTENT_ITEM_KEY;
1627                         key.offset = num_bytes;
1628                         btrfs_release_path(path);
1629                         goto again;
1630                 }
1631         }
1632
1633         if (ret && !insert) {
1634                 err = -ENOENT;
1635                 goto out;
1636         } else if (WARN_ON(ret)) {
1637                 err = -EIO;
1638                 goto out;
1639         }
1640
1641         leaf = path->nodes[0];
1642         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1643 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1644         if (item_size < sizeof(*ei)) {
1645                 if (!insert) {
1646                         err = -ENOENT;
1647                         goto out;
1648                 }
1649                 ret = convert_extent_item_v0(trans, root, path, owner,
1650                                              extra_size);
1651                 if (ret < 0) {
1652                         err = ret;
1653                         goto out;
1654                 }
1655                 leaf = path->nodes[0];
1656                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1657         }
1658 #endif
1659         BUG_ON(item_size < sizeof(*ei));
1660
1661         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1662         flags = btrfs_extent_flags(leaf, ei);
1663
1664         ptr = (unsigned long)(ei + 1);
1665         end = (unsigned long)ei + item_size;
1666
1667         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1668                 ptr += sizeof(struct btrfs_tree_block_info);
1669                 BUG_ON(ptr > end);
1670         }
1671
1672         err = -ENOENT;
1673         while (1) {
1674                 if (ptr >= end) {
1675                         WARN_ON(ptr > end);
1676                         break;
1677                 }
1678                 iref = (struct btrfs_extent_inline_ref *)ptr;
1679                 type = btrfs_extent_inline_ref_type(leaf, iref);
1680                 if (want < type)
1681                         break;
1682                 if (want > type) {
1683                         ptr += btrfs_extent_inline_ref_size(type);
1684                         continue;
1685                 }
1686
1687                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1688                         struct btrfs_extent_data_ref *dref;
1689                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1690                         if (match_extent_data_ref(leaf, dref, root_objectid,
1691                                                   owner, offset)) {
1692                                 err = 0;
1693                                 break;
1694                         }
1695                         if (hash_extent_data_ref_item(leaf, dref) <
1696                             hash_extent_data_ref(root_objectid, owner, offset))
1697                                 break;
1698                 } else {
1699                         u64 ref_offset;
1700                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1701                         if (parent > 0) {
1702                                 if (parent == ref_offset) {
1703                                         err = 0;
1704                                         break;
1705                                 }
1706                                 if (ref_offset < parent)
1707                                         break;
1708                         } else {
1709                                 if (root_objectid == ref_offset) {
1710                                         err = 0;
1711                                         break;
1712                                 }
1713                                 if (ref_offset < root_objectid)
1714                                         break;
1715                         }
1716                 }
1717                 ptr += btrfs_extent_inline_ref_size(type);
1718         }
1719         if (err == -ENOENT && insert) {
1720                 if (item_size + extra_size >=
1721                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1722                         err = -EAGAIN;
1723                         goto out;
1724                 }
1725                 /*
1726                  * To add new inline back ref, we have to make sure
1727                  * there is no corresponding back ref item.
1728                  * For simplicity, we just do not add new inline back
1729                  * ref if there is any kind of item for this block
1730                  */
1731                 if (find_next_key(path, 0, &key) == 0 &&
1732                     key.objectid == bytenr &&
1733                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1734                         err = -EAGAIN;
1735                         goto out;
1736                 }
1737         }
1738         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1739 out:
1740         if (insert) {
1741                 path->keep_locks = 0;
1742                 btrfs_unlock_up_safe(path, 1);
1743         }
1744         return err;
1745 }
1746
1747 /*
1748  * helper to add new inline back ref
1749  */
1750 static noinline_for_stack
1751 void setup_inline_extent_backref(struct btrfs_root *root,
1752                                  struct btrfs_path *path,
1753                                  struct btrfs_extent_inline_ref *iref,
1754                                  u64 parent, u64 root_objectid,
1755                                  u64 owner, u64 offset, int refs_to_add,
1756                                  struct btrfs_delayed_extent_op *extent_op)
1757 {
1758         struct extent_buffer *leaf;
1759         struct btrfs_extent_item *ei;
1760         unsigned long ptr;
1761         unsigned long end;
1762         unsigned long item_offset;
1763         u64 refs;
1764         int size;
1765         int type;
1766
1767         leaf = path->nodes[0];
1768         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1769         item_offset = (unsigned long)iref - (unsigned long)ei;
1770
1771         type = extent_ref_type(parent, owner);
1772         size = btrfs_extent_inline_ref_size(type);
1773
1774         btrfs_extend_item(root, path, size);
1775
1776         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1777         refs = btrfs_extent_refs(leaf, ei);
1778         refs += refs_to_add;
1779         btrfs_set_extent_refs(leaf, ei, refs);
1780         if (extent_op)
1781                 __run_delayed_extent_op(extent_op, leaf, ei);
1782
1783         ptr = (unsigned long)ei + item_offset;
1784         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1785         if (ptr < end - size)
1786                 memmove_extent_buffer(leaf, ptr + size, ptr,
1787                                       end - size - ptr);
1788
1789         iref = (struct btrfs_extent_inline_ref *)ptr;
1790         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1791         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1792                 struct btrfs_extent_data_ref *dref;
1793                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1794                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1795                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1796                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1797                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1798         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1799                 struct btrfs_shared_data_ref *sref;
1800                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1801                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1802                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1803         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1804                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1805         } else {
1806                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1807         }
1808         btrfs_mark_buffer_dirty(leaf);
1809 }
1810
1811 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1812                                  struct btrfs_root *root,
1813                                  struct btrfs_path *path,
1814                                  struct btrfs_extent_inline_ref **ref_ret,
1815                                  u64 bytenr, u64 num_bytes, u64 parent,
1816                                  u64 root_objectid, u64 owner, u64 offset)
1817 {
1818         int ret;
1819
1820         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1821                                            bytenr, num_bytes, parent,
1822                                            root_objectid, owner, offset, 0);
1823         if (ret != -ENOENT)
1824                 return ret;
1825
1826         btrfs_release_path(path);
1827         *ref_ret = NULL;
1828
1829         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1830                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1831                                             root_objectid);
1832         } else {
1833                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1834                                              root_objectid, owner, offset);
1835         }
1836         return ret;
1837 }
1838
1839 /*
1840  * helper to update/remove inline back ref
1841  */
1842 static noinline_for_stack
1843 void update_inline_extent_backref(struct btrfs_root *root,
1844                                   struct btrfs_path *path,
1845                                   struct btrfs_extent_inline_ref *iref,
1846                                   int refs_to_mod,
1847                                   struct btrfs_delayed_extent_op *extent_op,
1848                                   int *last_ref)
1849 {
1850         struct extent_buffer *leaf;
1851         struct btrfs_extent_item *ei;
1852         struct btrfs_extent_data_ref *dref = NULL;
1853         struct btrfs_shared_data_ref *sref = NULL;
1854         unsigned long ptr;
1855         unsigned long end;
1856         u32 item_size;
1857         int size;
1858         int type;
1859         u64 refs;
1860
1861         leaf = path->nodes[0];
1862         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1863         refs = btrfs_extent_refs(leaf, ei);
1864         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1865         refs += refs_to_mod;
1866         btrfs_set_extent_refs(leaf, ei, refs);
1867         if (extent_op)
1868                 __run_delayed_extent_op(extent_op, leaf, ei);
1869
1870         type = btrfs_extent_inline_ref_type(leaf, iref);
1871
1872         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1873                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1874                 refs = btrfs_extent_data_ref_count(leaf, dref);
1875         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1876                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1877                 refs = btrfs_shared_data_ref_count(leaf, sref);
1878         } else {
1879                 refs = 1;
1880                 BUG_ON(refs_to_mod != -1);
1881         }
1882
1883         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1884         refs += refs_to_mod;
1885
1886         if (refs > 0) {
1887                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1888                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1889                 else
1890                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1891         } else {
1892                 *last_ref = 1;
1893                 size =  btrfs_extent_inline_ref_size(type);
1894                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1895                 ptr = (unsigned long)iref;
1896                 end = (unsigned long)ei + item_size;
1897                 if (ptr + size < end)
1898                         memmove_extent_buffer(leaf, ptr, ptr + size,
1899                                               end - ptr - size);
1900                 item_size -= size;
1901                 btrfs_truncate_item(root, path, item_size, 1);
1902         }
1903         btrfs_mark_buffer_dirty(leaf);
1904 }
1905
1906 static noinline_for_stack
1907 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1908                                  struct btrfs_root *root,
1909                                  struct btrfs_path *path,
1910                                  u64 bytenr, u64 num_bytes, u64 parent,
1911                                  u64 root_objectid, u64 owner,
1912                                  u64 offset, int refs_to_add,
1913                                  struct btrfs_delayed_extent_op *extent_op)
1914 {
1915         struct btrfs_extent_inline_ref *iref;
1916         int ret;
1917
1918         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1919                                            bytenr, num_bytes, parent,
1920                                            root_objectid, owner, offset, 1);
1921         if (ret == 0) {
1922                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1923                 update_inline_extent_backref(root, path, iref,
1924                                              refs_to_add, extent_op, NULL);
1925         } else if (ret == -ENOENT) {
1926                 setup_inline_extent_backref(root, path, iref, parent,
1927                                             root_objectid, owner, offset,
1928                                             refs_to_add, extent_op);
1929                 ret = 0;
1930         }
1931         return ret;
1932 }
1933
1934 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1935                                  struct btrfs_root *root,
1936                                  struct btrfs_path *path,
1937                                  u64 bytenr, u64 parent, u64 root_objectid,
1938                                  u64 owner, u64 offset, int refs_to_add)
1939 {
1940         int ret;
1941         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1942                 BUG_ON(refs_to_add != 1);
1943                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1944                                             parent, root_objectid);
1945         } else {
1946                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1947                                              parent, root_objectid,
1948                                              owner, offset, refs_to_add);
1949         }
1950         return ret;
1951 }
1952
1953 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1954                                  struct btrfs_root *root,
1955                                  struct btrfs_path *path,
1956                                  struct btrfs_extent_inline_ref *iref,
1957                                  int refs_to_drop, int is_data, int *last_ref)
1958 {
1959         int ret = 0;
1960
1961         BUG_ON(!is_data && refs_to_drop != 1);
1962         if (iref) {
1963                 update_inline_extent_backref(root, path, iref,
1964                                              -refs_to_drop, NULL, last_ref);
1965         } else if (is_data) {
1966                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1967                                              last_ref);
1968         } else {
1969                 *last_ref = 1;
1970                 ret = btrfs_del_item(trans, root, path);
1971         }
1972         return ret;
1973 }
1974
1975 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1976 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1977                                u64 *discarded_bytes)
1978 {
1979         int j, ret = 0;
1980         u64 bytes_left, end;
1981         u64 aligned_start = ALIGN(start, 1 << 9);
1982
1983         if (WARN_ON(start != aligned_start)) {
1984                 len -= aligned_start - start;
1985                 len = round_down(len, 1 << 9);
1986                 start = aligned_start;
1987         }
1988
1989         *discarded_bytes = 0;
1990
1991         if (!len)
1992                 return 0;
1993
1994         end = start + len;
1995         bytes_left = len;
1996
1997         /* Skip any superblocks on this device. */
1998         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1999                 u64 sb_start = btrfs_sb_offset(j);
2000                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
2001                 u64 size = sb_start - start;
2002
2003                 if (!in_range(sb_start, start, bytes_left) &&
2004                     !in_range(sb_end, start, bytes_left) &&
2005                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
2006                         continue;
2007
2008                 /*
2009                  * Superblock spans beginning of range.  Adjust start and
2010                  * try again.
2011                  */
2012                 if (sb_start <= start) {
2013                         start += sb_end - start;
2014                         if (start > end) {
2015                                 bytes_left = 0;
2016                                 break;
2017                         }
2018                         bytes_left = end - start;
2019                         continue;
2020                 }
2021
2022                 if (size) {
2023                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2024                                                    GFP_NOFS, 0);
2025                         if (!ret)
2026                                 *discarded_bytes += size;
2027                         else if (ret != -EOPNOTSUPP)
2028                                 return ret;
2029                 }
2030
2031                 start = sb_end;
2032                 if (start > end) {
2033                         bytes_left = 0;
2034                         break;
2035                 }
2036                 bytes_left = end - start;
2037         }
2038
2039         if (bytes_left) {
2040                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2041                                            GFP_NOFS, 0);
2042                 if (!ret)
2043                         *discarded_bytes += bytes_left;
2044         }
2045         return ret;
2046 }
2047
2048 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2049                          u64 num_bytes, u64 *actual_bytes)
2050 {
2051         int ret;
2052         u64 discarded_bytes = 0;
2053         struct btrfs_bio *bbio = NULL;
2054
2055
2056         /*
2057          * Avoid races with device replace and make sure our bbio has devices
2058          * associated to its stripes that don't go away while we are discarding.
2059          */
2060         btrfs_bio_counter_inc_blocked(root->fs_info);
2061         /* Tell the block device(s) that the sectors can be discarded */
2062         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
2063                               bytenr, &num_bytes, &bbio, 0);
2064         /* Error condition is -ENOMEM */
2065         if (!ret) {
2066                 struct btrfs_bio_stripe *stripe = bbio->stripes;
2067                 int i;
2068
2069
2070                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2071                         u64 bytes;
2072                         if (!stripe->dev->can_discard)
2073                                 continue;
2074
2075                         ret = btrfs_issue_discard(stripe->dev->bdev,
2076                                                   stripe->physical,
2077                                                   stripe->length,
2078                                                   &bytes);
2079                         if (!ret)
2080                                 discarded_bytes += bytes;
2081                         else if (ret != -EOPNOTSUPP)
2082                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2083
2084                         /*
2085                          * Just in case we get back EOPNOTSUPP for some reason,
2086                          * just ignore the return value so we don't screw up
2087                          * people calling discard_extent.
2088                          */
2089                         ret = 0;
2090                 }
2091                 btrfs_put_bbio(bbio);
2092         }
2093         btrfs_bio_counter_dec(root->fs_info);
2094
2095         if (actual_bytes)
2096                 *actual_bytes = discarded_bytes;
2097
2098
2099         if (ret == -EOPNOTSUPP)
2100                 ret = 0;
2101         return ret;
2102 }
2103
2104 /* Can return -ENOMEM */
2105 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2106                          struct btrfs_root *root,
2107                          u64 bytenr, u64 num_bytes, u64 parent,
2108                          u64 root_objectid, u64 owner, u64 offset)
2109 {
2110         int ret;
2111         struct btrfs_fs_info *fs_info = root->fs_info;
2112
2113         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2114                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2115
2116         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2117                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2118                                         num_bytes,
2119                                         parent, root_objectid, (int)owner,
2120                                         BTRFS_ADD_DELAYED_REF, NULL);
2121         } else {
2122                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2123                                         num_bytes, parent, root_objectid,
2124                                         owner, offset, 0,
2125                                         BTRFS_ADD_DELAYED_REF, NULL);
2126         }
2127         return ret;
2128 }
2129
2130 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2131                                   struct btrfs_root *root,
2132                                   struct btrfs_delayed_ref_node *node,
2133                                   u64 parent, u64 root_objectid,
2134                                   u64 owner, u64 offset, int refs_to_add,
2135                                   struct btrfs_delayed_extent_op *extent_op)
2136 {
2137         struct btrfs_fs_info *fs_info = root->fs_info;
2138         struct btrfs_path *path;
2139         struct extent_buffer *leaf;
2140         struct btrfs_extent_item *item;
2141         struct btrfs_key key;
2142         u64 bytenr = node->bytenr;
2143         u64 num_bytes = node->num_bytes;
2144         u64 refs;
2145         int ret;
2146
2147         path = btrfs_alloc_path();
2148         if (!path)
2149                 return -ENOMEM;
2150
2151         path->reada = READA_FORWARD;
2152         path->leave_spinning = 1;
2153         /* this will setup the path even if it fails to insert the back ref */
2154         ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2155                                            bytenr, num_bytes, parent,
2156                                            root_objectid, owner, offset,
2157                                            refs_to_add, extent_op);
2158         if ((ret < 0 && ret != -EAGAIN) || !ret)
2159                 goto out;
2160
2161         /*
2162          * Ok we had -EAGAIN which means we didn't have space to insert and
2163          * inline extent ref, so just update the reference count and add a
2164          * normal backref.
2165          */
2166         leaf = path->nodes[0];
2167         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2168         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2169         refs = btrfs_extent_refs(leaf, item);
2170         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2171         if (extent_op)
2172                 __run_delayed_extent_op(extent_op, leaf, item);
2173
2174         btrfs_mark_buffer_dirty(leaf);
2175         btrfs_release_path(path);
2176
2177         path->reada = READA_FORWARD;
2178         path->leave_spinning = 1;
2179         /* now insert the actual backref */
2180         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2181                                     path, bytenr, parent, root_objectid,
2182                                     owner, offset, refs_to_add);
2183         if (ret)
2184                 btrfs_abort_transaction(trans, ret);
2185 out:
2186         btrfs_free_path(path);
2187         return ret;
2188 }
2189
2190 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2191                                 struct btrfs_root *root,
2192                                 struct btrfs_delayed_ref_node *node,
2193                                 struct btrfs_delayed_extent_op *extent_op,
2194                                 int insert_reserved)
2195 {
2196         int ret = 0;
2197         struct btrfs_delayed_data_ref *ref;
2198         struct btrfs_key ins;
2199         u64 parent = 0;
2200         u64 ref_root = 0;
2201         u64 flags = 0;
2202
2203         ins.objectid = node->bytenr;
2204         ins.offset = node->num_bytes;
2205         ins.type = BTRFS_EXTENT_ITEM_KEY;
2206
2207         ref = btrfs_delayed_node_to_data_ref(node);
2208         trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
2209
2210         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2211                 parent = ref->parent;
2212         ref_root = ref->root;
2213
2214         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2215                 if (extent_op)
2216                         flags |= extent_op->flags_to_set;
2217                 ret = alloc_reserved_file_extent(trans, root,
2218                                                  parent, ref_root, flags,
2219                                                  ref->objectid, ref->offset,
2220                                                  &ins, node->ref_mod);
2221         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2222                 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2223                                              ref_root, ref->objectid,
2224                                              ref->offset, node->ref_mod,
2225                                              extent_op);
2226         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2227                 ret = __btrfs_free_extent(trans, root, node, parent,
2228                                           ref_root, ref->objectid,
2229                                           ref->offset, node->ref_mod,
2230                                           extent_op);
2231         } else {
2232                 BUG();
2233         }
2234         return ret;
2235 }
2236
2237 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2238                                     struct extent_buffer *leaf,
2239                                     struct btrfs_extent_item *ei)
2240 {
2241         u64 flags = btrfs_extent_flags(leaf, ei);
2242         if (extent_op->update_flags) {
2243                 flags |= extent_op->flags_to_set;
2244                 btrfs_set_extent_flags(leaf, ei, flags);
2245         }
2246
2247         if (extent_op->update_key) {
2248                 struct btrfs_tree_block_info *bi;
2249                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2250                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2251                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2252         }
2253 }
2254
2255 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2256                                  struct btrfs_root *root,
2257                                  struct btrfs_delayed_ref_node *node,
2258                                  struct btrfs_delayed_extent_op *extent_op)
2259 {
2260         struct btrfs_key key;
2261         struct btrfs_path *path;
2262         struct btrfs_extent_item *ei;
2263         struct extent_buffer *leaf;
2264         u32 item_size;
2265         int ret;
2266         int err = 0;
2267         int metadata = !extent_op->is_data;
2268
2269         if (trans->aborted)
2270                 return 0;
2271
2272         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2273                 metadata = 0;
2274
2275         path = btrfs_alloc_path();
2276         if (!path)
2277                 return -ENOMEM;
2278
2279         key.objectid = node->bytenr;
2280
2281         if (metadata) {
2282                 key.type = BTRFS_METADATA_ITEM_KEY;
2283                 key.offset = extent_op->level;
2284         } else {
2285                 key.type = BTRFS_EXTENT_ITEM_KEY;
2286                 key.offset = node->num_bytes;
2287         }
2288
2289 again:
2290         path->reada = READA_FORWARD;
2291         path->leave_spinning = 1;
2292         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2293                                 path, 0, 1);
2294         if (ret < 0) {
2295                 err = ret;
2296                 goto out;
2297         }
2298         if (ret > 0) {
2299                 if (metadata) {
2300                         if (path->slots[0] > 0) {
2301                                 path->slots[0]--;
2302                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2303                                                       path->slots[0]);
2304                                 if (key.objectid == node->bytenr &&
2305                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2306                                     key.offset == node->num_bytes)
2307                                         ret = 0;
2308                         }
2309                         if (ret > 0) {
2310                                 btrfs_release_path(path);
2311                                 metadata = 0;
2312
2313                                 key.objectid = node->bytenr;
2314                                 key.offset = node->num_bytes;
2315                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2316                                 goto again;
2317                         }
2318                 } else {
2319                         err = -EIO;
2320                         goto out;
2321                 }
2322         }
2323
2324         leaf = path->nodes[0];
2325         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2326 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2327         if (item_size < sizeof(*ei)) {
2328                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2329                                              path, (u64)-1, 0);
2330                 if (ret < 0) {
2331                         err = ret;
2332                         goto out;
2333                 }
2334                 leaf = path->nodes[0];
2335                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2336         }
2337 #endif
2338         BUG_ON(item_size < sizeof(*ei));
2339         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2340         __run_delayed_extent_op(extent_op, leaf, ei);
2341
2342         btrfs_mark_buffer_dirty(leaf);
2343 out:
2344         btrfs_free_path(path);
2345         return err;
2346 }
2347
2348 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2349                                 struct btrfs_root *root,
2350                                 struct btrfs_delayed_ref_node *node,
2351                                 struct btrfs_delayed_extent_op *extent_op,
2352                                 int insert_reserved)
2353 {
2354         int ret = 0;
2355         struct btrfs_delayed_tree_ref *ref;
2356         struct btrfs_key ins;
2357         u64 parent = 0;
2358         u64 ref_root = 0;
2359         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2360                                                  SKINNY_METADATA);
2361
2362         ref = btrfs_delayed_node_to_tree_ref(node);
2363         trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
2364
2365         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2366                 parent = ref->parent;
2367         ref_root = ref->root;
2368
2369         ins.objectid = node->bytenr;
2370         if (skinny_metadata) {
2371                 ins.offset = ref->level;
2372                 ins.type = BTRFS_METADATA_ITEM_KEY;
2373         } else {
2374                 ins.offset = node->num_bytes;
2375                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2376         }
2377
2378         BUG_ON(node->ref_mod != 1);
2379         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2380                 BUG_ON(!extent_op || !extent_op->update_flags);
2381                 ret = alloc_reserved_tree_block(trans, root,
2382                                                 parent, ref_root,
2383                                                 extent_op->flags_to_set,
2384                                                 &extent_op->key,
2385                                                 ref->level, &ins);
2386         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2387                 ret = __btrfs_inc_extent_ref(trans, root, node,
2388                                              parent, ref_root,
2389                                              ref->level, 0, 1,
2390                                              extent_op);
2391         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2392                 ret = __btrfs_free_extent(trans, root, node,
2393                                           parent, ref_root,
2394                                           ref->level, 0, 1, extent_op);
2395         } else {
2396                 BUG();
2397         }
2398         return ret;
2399 }
2400
2401 /* helper function to actually process a single delayed ref entry */
2402 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2403                                struct btrfs_root *root,
2404                                struct btrfs_delayed_ref_node *node,
2405                                struct btrfs_delayed_extent_op *extent_op,
2406                                int insert_reserved)
2407 {
2408         int ret = 0;
2409
2410         if (trans->aborted) {
2411                 if (insert_reserved)
2412                         btrfs_pin_extent(root, node->bytenr,
2413                                          node->num_bytes, 1);
2414                 return 0;
2415         }
2416
2417         if (btrfs_delayed_ref_is_head(node)) {
2418                 struct btrfs_delayed_ref_head *head;
2419                 /*
2420                  * we've hit the end of the chain and we were supposed
2421                  * to insert this extent into the tree.  But, it got
2422                  * deleted before we ever needed to insert it, so all
2423                  * we have to do is clean up the accounting
2424                  */
2425                 BUG_ON(extent_op);
2426                 head = btrfs_delayed_node_to_head(node);
2427                 trace_run_delayed_ref_head(root->fs_info, node, head,
2428                                            node->action);
2429
2430                 if (insert_reserved) {
2431                         btrfs_pin_extent(root, node->bytenr,
2432                                          node->num_bytes, 1);
2433                         if (head->is_data) {
2434                                 ret = btrfs_del_csums(trans, root,
2435                                                       node->bytenr,
2436                                                       node->num_bytes);
2437                         }
2438                 }
2439
2440                 /* Also free its reserved qgroup space */
2441                 btrfs_qgroup_free_delayed_ref(root->fs_info,
2442                                               head->qgroup_ref_root,
2443                                               head->qgroup_reserved);
2444                 return ret;
2445         }
2446
2447         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2448             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2449                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2450                                            insert_reserved);
2451         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2452                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2453                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2454                                            insert_reserved);
2455         else
2456                 BUG();
2457         return ret;
2458 }
2459
2460 static inline struct btrfs_delayed_ref_node *
2461 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2462 {
2463         struct btrfs_delayed_ref_node *ref;
2464
2465         if (list_empty(&head->ref_list))
2466                 return NULL;
2467
2468         /*
2469          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2470          * This is to prevent a ref count from going down to zero, which deletes
2471          * the extent item from the extent tree, when there still are references
2472          * to add, which would fail because they would not find the extent item.
2473          */
2474         list_for_each_entry(ref, &head->ref_list, list) {
2475                 if (ref->action == BTRFS_ADD_DELAYED_REF)
2476                         return ref;
2477         }
2478
2479         return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2480                           list);
2481 }
2482
2483 /*
2484  * Returns 0 on success or if called with an already aborted transaction.
2485  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2486  */
2487 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2488                                              struct btrfs_root *root,
2489                                              unsigned long nr)
2490 {
2491         struct btrfs_delayed_ref_root *delayed_refs;
2492         struct btrfs_delayed_ref_node *ref;
2493         struct btrfs_delayed_ref_head *locked_ref = NULL;
2494         struct btrfs_delayed_extent_op *extent_op;
2495         struct btrfs_fs_info *fs_info = root->fs_info;
2496         ktime_t start = ktime_get();
2497         int ret;
2498         unsigned long count = 0;
2499         unsigned long actual_count = 0;
2500         int must_insert_reserved = 0;
2501
2502         delayed_refs = &trans->transaction->delayed_refs;
2503         while (1) {
2504                 if (!locked_ref) {
2505                         if (count >= nr)
2506                                 break;
2507
2508                         spin_lock(&delayed_refs->lock);
2509                         locked_ref = btrfs_select_ref_head(trans);
2510                         if (!locked_ref) {
2511                                 spin_unlock(&delayed_refs->lock);
2512                                 break;
2513                         }
2514
2515                         /* grab the lock that says we are going to process
2516                          * all the refs for this head */
2517                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2518                         spin_unlock(&delayed_refs->lock);
2519                         /*
2520                          * we may have dropped the spin lock to get the head
2521                          * mutex lock, and that might have given someone else
2522                          * time to free the head.  If that's true, it has been
2523                          * removed from our list and we can move on.
2524                          */
2525                         if (ret == -EAGAIN) {
2526                                 locked_ref = NULL;
2527                                 count++;
2528                                 continue;
2529                         }
2530                 }
2531
2532                 /*
2533                  * We need to try and merge add/drops of the same ref since we
2534                  * can run into issues with relocate dropping the implicit ref
2535                  * and then it being added back again before the drop can
2536                  * finish.  If we merged anything we need to re-loop so we can
2537                  * get a good ref.
2538                  * Or we can get node references of the same type that weren't
2539                  * merged when created due to bumps in the tree mod seq, and
2540                  * we need to merge them to prevent adding an inline extent
2541                  * backref before dropping it (triggering a BUG_ON at
2542                  * insert_inline_extent_backref()).
2543                  */
2544                 spin_lock(&locked_ref->lock);
2545                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2546                                          locked_ref);
2547
2548                 /*
2549                  * locked_ref is the head node, so we have to go one
2550                  * node back for any delayed ref updates
2551                  */
2552                 ref = select_delayed_ref(locked_ref);
2553
2554                 if (ref && ref->seq &&
2555                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2556                         spin_unlock(&locked_ref->lock);
2557                         btrfs_delayed_ref_unlock(locked_ref);
2558                         spin_lock(&delayed_refs->lock);
2559                         locked_ref->processing = 0;
2560                         delayed_refs->num_heads_ready++;
2561                         spin_unlock(&delayed_refs->lock);
2562                         locked_ref = NULL;
2563                         cond_resched();
2564                         count++;
2565                         continue;
2566                 }
2567
2568                 /*
2569                  * record the must insert reserved flag before we
2570                  * drop the spin lock.
2571                  */
2572                 must_insert_reserved = locked_ref->must_insert_reserved;
2573                 locked_ref->must_insert_reserved = 0;
2574
2575                 extent_op = locked_ref->extent_op;
2576                 locked_ref->extent_op = NULL;
2577
2578                 if (!ref) {
2579
2580
2581                         /* All delayed refs have been processed, Go ahead
2582                          * and send the head node to run_one_delayed_ref,
2583                          * so that any accounting fixes can happen
2584                          */
2585                         ref = &locked_ref->node;
2586
2587                         if (extent_op && must_insert_reserved) {
2588                                 btrfs_free_delayed_extent_op(extent_op);
2589                                 extent_op = NULL;
2590                         }
2591
2592                         if (extent_op) {
2593                                 spin_unlock(&locked_ref->lock);
2594                                 ret = run_delayed_extent_op(trans, root,
2595                                                             ref, extent_op);
2596                                 btrfs_free_delayed_extent_op(extent_op);
2597
2598                                 if (ret) {
2599                                         /*
2600                                          * Need to reset must_insert_reserved if
2601                                          * there was an error so the abort stuff
2602                                          * can cleanup the reserved space
2603                                          * properly.
2604                                          */
2605                                         if (must_insert_reserved)
2606                                                 locked_ref->must_insert_reserved = 1;
2607                                         locked_ref->processing = 0;
2608                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2609                                         btrfs_delayed_ref_unlock(locked_ref);
2610                                         return ret;
2611                                 }
2612                                 continue;
2613                         }
2614
2615                         /*
2616                          * Need to drop our head ref lock and re-acquire the
2617                          * delayed ref lock and then re-check to make sure
2618                          * nobody got added.
2619                          */
2620                         spin_unlock(&locked_ref->lock);
2621                         spin_lock(&delayed_refs->lock);
2622                         spin_lock(&locked_ref->lock);
2623                         if (!list_empty(&locked_ref->ref_list) ||
2624                             locked_ref->extent_op) {
2625                                 spin_unlock(&locked_ref->lock);
2626                                 spin_unlock(&delayed_refs->lock);
2627                                 continue;
2628                         }
2629                         ref->in_tree = 0;
2630                         delayed_refs->num_heads--;
2631                         rb_erase(&locked_ref->href_node,
2632                                  &delayed_refs->href_root);
2633                         spin_unlock(&delayed_refs->lock);
2634                 } else {
2635                         actual_count++;
2636                         ref->in_tree = 0;
2637                         list_del(&ref->list);
2638                 }
2639                 atomic_dec(&delayed_refs->num_entries);
2640
2641                 if (!btrfs_delayed_ref_is_head(ref)) {
2642                         /*
2643                          * when we play the delayed ref, also correct the
2644                          * ref_mod on head
2645                          */
2646                         switch (ref->action) {
2647                         case BTRFS_ADD_DELAYED_REF:
2648                         case BTRFS_ADD_DELAYED_EXTENT:
2649                                 locked_ref->node.ref_mod -= ref->ref_mod;
2650                                 break;
2651                         case BTRFS_DROP_DELAYED_REF:
2652                                 locked_ref->node.ref_mod += ref->ref_mod;
2653                                 break;
2654                         default:
2655                                 WARN_ON(1);
2656                         }
2657                 }
2658                 spin_unlock(&locked_ref->lock);
2659
2660                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2661                                           must_insert_reserved);
2662
2663                 btrfs_free_delayed_extent_op(extent_op);
2664                 if (ret) {
2665                         locked_ref->processing = 0;
2666                         btrfs_delayed_ref_unlock(locked_ref);
2667                         btrfs_put_delayed_ref(ref);
2668                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2669                         return ret;
2670                 }
2671
2672                 /*
2673                  * If this node is a head, that means all the refs in this head
2674                  * have been dealt with, and we will pick the next head to deal
2675                  * with, so we must unlock the head and drop it from the cluster
2676                  * list before we release it.
2677                  */
2678                 if (btrfs_delayed_ref_is_head(ref)) {
2679                         if (locked_ref->is_data &&
2680                             locked_ref->total_ref_mod < 0) {
2681                                 spin_lock(&delayed_refs->lock);
2682                                 delayed_refs->pending_csums -= ref->num_bytes;
2683                                 spin_unlock(&delayed_refs->lock);
2684                         }
2685                         btrfs_delayed_ref_unlock(locked_ref);
2686                         locked_ref = NULL;
2687                 }
2688                 btrfs_put_delayed_ref(ref);
2689                 count++;
2690                 cond_resched();
2691         }
2692
2693         /*
2694          * We don't want to include ref heads since we can have empty ref heads
2695          * and those will drastically skew our runtime down since we just do
2696          * accounting, no actual extent tree updates.
2697          */
2698         if (actual_count > 0) {
2699                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2700                 u64 avg;
2701
2702                 /*
2703                  * We weigh the current average higher than our current runtime
2704                  * to avoid large swings in the average.
2705                  */
2706                 spin_lock(&delayed_refs->lock);
2707                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2708                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2709                 spin_unlock(&delayed_refs->lock);
2710         }
2711         return 0;
2712 }
2713
2714 #ifdef SCRAMBLE_DELAYED_REFS
2715 /*
2716  * Normally delayed refs get processed in ascending bytenr order. This
2717  * correlates in most cases to the order added. To expose dependencies on this
2718  * order, we start to process the tree in the middle instead of the beginning
2719  */
2720 static u64 find_middle(struct rb_root *root)
2721 {
2722         struct rb_node *n = root->rb_node;
2723         struct btrfs_delayed_ref_node *entry;
2724         int alt = 1;
2725         u64 middle;
2726         u64 first = 0, last = 0;
2727
2728         n = rb_first(root);
2729         if (n) {
2730                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2731                 first = entry->bytenr;
2732         }
2733         n = rb_last(root);
2734         if (n) {
2735                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2736                 last = entry->bytenr;
2737         }
2738         n = root->rb_node;
2739
2740         while (n) {
2741                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2742                 WARN_ON(!entry->in_tree);
2743
2744                 middle = entry->bytenr;
2745
2746                 if (alt)
2747                         n = n->rb_left;
2748                 else
2749                         n = n->rb_right;
2750
2751                 alt = 1 - alt;
2752         }
2753         return middle;
2754 }
2755 #endif
2756
2757 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2758 {
2759         u64 num_bytes;
2760
2761         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2762                              sizeof(struct btrfs_extent_inline_ref));
2763         if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2764                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2765
2766         /*
2767          * We don't ever fill up leaves all the way so multiply by 2 just to be
2768          * closer to what we're really going to want to use.
2769          */
2770         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2771 }
2772
2773 /*
2774  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2775  * would require to store the csums for that many bytes.
2776  */
2777 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2778 {
2779         u64 csum_size;
2780         u64 num_csums_per_leaf;
2781         u64 num_csums;
2782
2783         csum_size = BTRFS_MAX_ITEM_SIZE(root);
2784         num_csums_per_leaf = div64_u64(csum_size,
2785                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2786         num_csums = div64_u64(csum_bytes, root->sectorsize);
2787         num_csums += num_csums_per_leaf - 1;
2788         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2789         return num_csums;
2790 }
2791
2792 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2793                                        struct btrfs_root *root)
2794 {
2795         struct btrfs_block_rsv *global_rsv;
2796         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2797         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2798         u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2799         u64 num_bytes, num_dirty_bgs_bytes;
2800         int ret = 0;
2801
2802         num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2803         num_heads = heads_to_leaves(root, num_heads);
2804         if (num_heads > 1)
2805                 num_bytes += (num_heads - 1) * root->nodesize;
2806         num_bytes <<= 1;
2807         num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2808         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2809                                                              num_dirty_bgs);
2810         global_rsv = &root->fs_info->global_block_rsv;
2811
2812         /*
2813          * If we can't allocate any more chunks lets make sure we have _lots_ of
2814          * wiggle room since running delayed refs can create more delayed refs.
2815          */
2816         if (global_rsv->space_info->full) {
2817                 num_dirty_bgs_bytes <<= 1;
2818                 num_bytes <<= 1;
2819         }
2820
2821         spin_lock(&global_rsv->lock);
2822         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2823                 ret = 1;
2824         spin_unlock(&global_rsv->lock);
2825         return ret;
2826 }
2827
2828 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2829                                        struct btrfs_root *root)
2830 {
2831         struct btrfs_fs_info *fs_info = root->fs_info;
2832         u64 num_entries =
2833                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2834         u64 avg_runtime;
2835         u64 val;
2836
2837         smp_mb();
2838         avg_runtime = fs_info->avg_delayed_ref_runtime;
2839         val = num_entries * avg_runtime;
2840         if (num_entries * avg_runtime >= NSEC_PER_SEC)
2841                 return 1;
2842         if (val >= NSEC_PER_SEC / 2)
2843                 return 2;
2844
2845         return btrfs_check_space_for_delayed_refs(trans, root);
2846 }
2847
2848 struct async_delayed_refs {
2849         struct btrfs_root *root;
2850         u64 transid;
2851         int count;
2852         int error;
2853         int sync;
2854         struct completion wait;
2855         struct btrfs_work work;
2856 };
2857
2858 static void delayed_ref_async_start(struct btrfs_work *work)
2859 {
2860         struct async_delayed_refs *async;
2861         struct btrfs_trans_handle *trans;
2862         int ret;
2863
2864         async = container_of(work, struct async_delayed_refs, work);
2865
2866         /* if the commit is already started, we don't need to wait here */
2867         if (btrfs_transaction_blocked(async->root->fs_info))
2868                 goto done;
2869
2870         trans = btrfs_join_transaction(async->root);
2871         if (IS_ERR(trans)) {
2872                 async->error = PTR_ERR(trans);
2873                 goto done;
2874         }
2875
2876         /*
2877          * trans->sync means that when we call end_transaction, we won't
2878          * wait on delayed refs
2879          */
2880         trans->sync = true;
2881
2882         /* Don't bother flushing if we got into a different transaction */
2883         if (trans->transid > async->transid)
2884                 goto end;
2885
2886         ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2887         if (ret)
2888                 async->error = ret;
2889 end:
2890         ret = btrfs_end_transaction(trans, async->root);
2891         if (ret && !async->error)
2892                 async->error = ret;
2893 done:
2894         if (async->sync)
2895                 complete(&async->wait);
2896         else
2897                 kfree(async);
2898 }
2899
2900 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2901                                  unsigned long count, u64 transid, int wait)
2902 {
2903         struct async_delayed_refs *async;
2904         int ret;
2905
2906         async = kmalloc(sizeof(*async), GFP_NOFS);
2907         if (!async)
2908                 return -ENOMEM;
2909
2910         async->root = root->fs_info->tree_root;
2911         async->count = count;
2912         async->error = 0;
2913         async->transid = transid;
2914         if (wait)
2915                 async->sync = 1;
2916         else
2917                 async->sync = 0;
2918         init_completion(&async->wait);
2919
2920         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2921                         delayed_ref_async_start, NULL, NULL);
2922
2923         btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2924
2925         if (wait) {
2926                 wait_for_completion(&async->wait);
2927                 ret = async->error;
2928                 kfree(async);
2929                 return ret;
2930         }
2931         return 0;
2932 }
2933
2934 /*
2935  * this starts processing the delayed reference count updates and
2936  * extent insertions we have queued up so far.  count can be
2937  * 0, which means to process everything in the tree at the start
2938  * of the run (but not newly added entries), or it can be some target
2939  * number you'd like to process.
2940  *
2941  * Returns 0 on success or if called with an aborted transaction
2942  * Returns <0 on error and aborts the transaction
2943  */
2944 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2945                            struct btrfs_root *root, unsigned long count)
2946 {
2947         struct rb_node *node;
2948         struct btrfs_delayed_ref_root *delayed_refs;
2949         struct btrfs_delayed_ref_head *head;
2950         int ret;
2951         int run_all = count == (unsigned long)-1;
2952         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2953
2954         /* We'll clean this up in btrfs_cleanup_transaction */
2955         if (trans->aborted)
2956                 return 0;
2957
2958         if (root->fs_info->creating_free_space_tree)
2959                 return 0;
2960
2961         if (root == root->fs_info->extent_root)
2962                 root = root->fs_info->tree_root;
2963
2964         delayed_refs = &trans->transaction->delayed_refs;
2965         if (count == 0)
2966                 count = atomic_read(&delayed_refs->num_entries) * 2;
2967
2968 again:
2969 #ifdef SCRAMBLE_DELAYED_REFS
2970         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2971 #endif
2972         trans->can_flush_pending_bgs = false;
2973         ret = __btrfs_run_delayed_refs(trans, root, count);
2974         if (ret < 0) {
2975                 btrfs_abort_transaction(trans, ret);
2976                 return ret;
2977         }
2978
2979         if (run_all) {
2980                 if (!list_empty(&trans->new_bgs))
2981                         btrfs_create_pending_block_groups(trans, root);
2982
2983                 spin_lock(&delayed_refs->lock);
2984                 node = rb_first(&delayed_refs->href_root);
2985                 if (!node) {
2986                         spin_unlock(&delayed_refs->lock);
2987                         goto out;
2988                 }
2989                 count = (unsigned long)-1;
2990
2991                 while (node) {
2992                         head = rb_entry(node, struct btrfs_delayed_ref_head,
2993                                         href_node);
2994                         if (btrfs_delayed_ref_is_head(&head->node)) {
2995                                 struct btrfs_delayed_ref_node *ref;
2996
2997                                 ref = &head->node;
2998                                 atomic_inc(&ref->refs);
2999
3000                                 spin_unlock(&delayed_refs->lock);
3001                                 /*
3002                                  * Mutex was contended, block until it's
3003                                  * released and try again
3004                                  */
3005                                 mutex_lock(&head->mutex);
3006                                 mutex_unlock(&head->mutex);
3007
3008                                 btrfs_put_delayed_ref(ref);
3009                                 cond_resched();
3010                                 goto again;
3011                         } else {
3012                                 WARN_ON(1);
3013                         }
3014                         node = rb_next(node);
3015                 }
3016                 spin_unlock(&delayed_refs->lock);
3017                 cond_resched();
3018                 goto again;
3019         }
3020 out:
3021         assert_qgroups_uptodate(trans);
3022         trans->can_flush_pending_bgs = can_flush_pending_bgs;
3023         return 0;
3024 }
3025
3026 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3027                                 struct btrfs_root *root,
3028                                 u64 bytenr, u64 num_bytes, u64 flags,
3029                                 int level, int is_data)
3030 {
3031         struct btrfs_delayed_extent_op *extent_op;
3032         int ret;
3033
3034         extent_op = btrfs_alloc_delayed_extent_op();
3035         if (!extent_op)
3036                 return -ENOMEM;
3037
3038         extent_op->flags_to_set = flags;
3039         extent_op->update_flags = true;
3040         extent_op->update_key = false;
3041         extent_op->is_data = is_data ? true : false;
3042         extent_op->level = level;
3043
3044         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
3045                                           num_bytes, extent_op);
3046         if (ret)
3047                 btrfs_free_delayed_extent_op(extent_op);
3048         return ret;
3049 }
3050
3051 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
3052                                       struct btrfs_root *root,
3053                                       struct btrfs_path *path,
3054                                       u64 objectid, u64 offset, u64 bytenr)
3055 {
3056         struct btrfs_delayed_ref_head *head;
3057         struct btrfs_delayed_ref_node *ref;
3058         struct btrfs_delayed_data_ref *data_ref;
3059         struct btrfs_delayed_ref_root *delayed_refs;
3060         int ret = 0;
3061
3062         delayed_refs = &trans->transaction->delayed_refs;
3063         spin_lock(&delayed_refs->lock);
3064         head = btrfs_find_delayed_ref_head(trans, bytenr);
3065         if (!head) {
3066                 spin_unlock(&delayed_refs->lock);
3067                 return 0;
3068         }
3069
3070         if (!mutex_trylock(&head->mutex)) {
3071                 atomic_inc(&head->node.refs);
3072                 spin_unlock(&delayed_refs->lock);
3073
3074                 btrfs_release_path(path);
3075
3076                 /*
3077                  * Mutex was contended, block until it's released and let
3078                  * caller try again
3079                  */
3080                 mutex_lock(&head->mutex);
3081                 mutex_unlock(&head->mutex);
3082                 btrfs_put_delayed_ref(&head->node);
3083                 return -EAGAIN;
3084         }
3085         spin_unlock(&delayed_refs->lock);
3086
3087         spin_lock(&head->lock);
3088         list_for_each_entry(ref, &head->ref_list, list) {
3089                 /* If it's a shared ref we know a cross reference exists */
3090                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3091                         ret = 1;
3092                         break;
3093                 }
3094
3095                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3096
3097                 /*
3098                  * If our ref doesn't match the one we're currently looking at
3099                  * then we have a cross reference.
3100                  */
3101                 if (data_ref->root != root->root_key.objectid ||
3102                     data_ref->objectid != objectid ||
3103                     data_ref->offset != offset) {
3104                         ret = 1;
3105                         break;
3106                 }
3107         }
3108         spin_unlock(&head->lock);
3109         mutex_unlock(&head->mutex);
3110         return ret;
3111 }
3112
3113 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
3114                                         struct btrfs_root *root,
3115                                         struct btrfs_path *path,
3116                                         u64 objectid, u64 offset, u64 bytenr)
3117 {
3118         struct btrfs_root *extent_root = root->fs_info->extent_root;
3119         struct extent_buffer *leaf;
3120         struct btrfs_extent_data_ref *ref;
3121         struct btrfs_extent_inline_ref *iref;
3122         struct btrfs_extent_item *ei;
3123         struct btrfs_key key;
3124         u32 item_size;
3125         int ret;
3126
3127         key.objectid = bytenr;
3128         key.offset = (u64)-1;
3129         key.type = BTRFS_EXTENT_ITEM_KEY;
3130
3131         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3132         if (ret < 0)
3133                 goto out;
3134         BUG_ON(ret == 0); /* Corruption */
3135
3136         ret = -ENOENT;
3137         if (path->slots[0] == 0)
3138                 goto out;
3139
3140         path->slots[0]--;
3141         leaf = path->nodes[0];
3142         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3143
3144         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3145                 goto out;
3146
3147         ret = 1;
3148         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3149 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3150         if (item_size < sizeof(*ei)) {
3151                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3152                 goto out;
3153         }
3154 #endif
3155         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3156
3157         if (item_size != sizeof(*ei) +
3158             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3159                 goto out;
3160
3161         if (btrfs_extent_generation(leaf, ei) <=
3162             btrfs_root_last_snapshot(&root->root_item))
3163                 goto out;
3164
3165         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3166         if (btrfs_extent_inline_ref_type(leaf, iref) !=
3167             BTRFS_EXTENT_DATA_REF_KEY)
3168                 goto out;
3169
3170         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3171         if (btrfs_extent_refs(leaf, ei) !=
3172             btrfs_extent_data_ref_count(leaf, ref) ||
3173             btrfs_extent_data_ref_root(leaf, ref) !=
3174             root->root_key.objectid ||
3175             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3176             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3177                 goto out;
3178
3179         ret = 0;
3180 out:
3181         return ret;
3182 }
3183
3184 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3185                           struct btrfs_root *root,
3186                           u64 objectid, u64 offset, u64 bytenr)
3187 {
3188         struct btrfs_path *path;
3189         int ret;
3190         int ret2;
3191
3192         path = btrfs_alloc_path();
3193         if (!path)
3194                 return -ENOENT;
3195
3196         do {
3197                 ret = check_committed_ref(trans, root, path, objectid,
3198                                           offset, bytenr);
3199                 if (ret && ret != -ENOENT)
3200                         goto out;
3201
3202                 ret2 = check_delayed_ref(trans, root, path, objectid,
3203                                          offset, bytenr);
3204         } while (ret2 == -EAGAIN);
3205
3206         if (ret2 && ret2 != -ENOENT) {
3207                 ret = ret2;
3208                 goto out;
3209         }
3210
3211         if (ret != -ENOENT || ret2 != -ENOENT)
3212                 ret = 0;
3213 out:
3214         btrfs_free_path(path);
3215         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3216                 WARN_ON(ret > 0);
3217         return ret;
3218 }
3219
3220 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3221                            struct btrfs_root *root,
3222                            struct extent_buffer *buf,
3223                            int full_backref, int inc)
3224 {
3225         u64 bytenr;
3226         u64 num_bytes;
3227         u64 parent;
3228         u64 ref_root;
3229         u32 nritems;
3230         struct btrfs_key key;
3231         struct btrfs_file_extent_item *fi;
3232         int i;
3233         int level;
3234         int ret = 0;
3235         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3236                             u64, u64, u64, u64, u64, u64);
3237
3238
3239         if (btrfs_is_testing(root->fs_info))
3240                 return 0;
3241
3242         ref_root = btrfs_header_owner(buf);
3243         nritems = btrfs_header_nritems(buf);
3244         level = btrfs_header_level(buf);
3245
3246         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3247                 return 0;
3248
3249         if (inc)
3250                 process_func = btrfs_inc_extent_ref;
3251         else
3252                 process_func = btrfs_free_extent;
3253
3254         if (full_backref)
3255                 parent = buf->start;
3256         else
3257                 parent = 0;
3258
3259         for (i = 0; i < nritems; i++) {
3260                 if (level == 0) {
3261                         btrfs_item_key_to_cpu(buf, &key, i);
3262                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3263                                 continue;
3264                         fi = btrfs_item_ptr(buf, i,
3265                                             struct btrfs_file_extent_item);
3266                         if (btrfs_file_extent_type(buf, fi) ==
3267                             BTRFS_FILE_EXTENT_INLINE)
3268                                 continue;
3269                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3270                         if (bytenr == 0)
3271                                 continue;
3272
3273                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3274                         key.offset -= btrfs_file_extent_offset(buf, fi);
3275                         ret = process_func(trans, root, bytenr, num_bytes,
3276                                            parent, ref_root, key.objectid,
3277                                            key.offset);
3278                         if (ret)
3279                                 goto fail;
3280                 } else {
3281                         bytenr = btrfs_node_blockptr(buf, i);
3282                         num_bytes = root->nodesize;
3283                         ret = process_func(trans, root, bytenr, num_bytes,
3284                                            parent, ref_root, level - 1, 0);
3285                         if (ret)
3286                                 goto fail;
3287                 }
3288         }
3289         return 0;
3290 fail:
3291         return ret;
3292 }
3293
3294 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3295                   struct extent_buffer *buf, int full_backref)
3296 {
3297         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3298 }
3299
3300 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3301                   struct extent_buffer *buf, int full_backref)
3302 {
3303         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3304 }
3305
3306 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3307                                  struct btrfs_root *root,
3308                                  struct btrfs_path *path,
3309                                  struct btrfs_block_group_cache *cache)
3310 {
3311         int ret;
3312         struct btrfs_root *extent_root = root->fs_info->extent_root;
3313         unsigned long bi;
3314         struct extent_buffer *leaf;
3315
3316         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3317         if (ret) {
3318                 if (ret > 0)
3319                         ret = -ENOENT;
3320                 goto fail;
3321         }
3322
3323         leaf = path->nodes[0];
3324         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3325         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3326         btrfs_mark_buffer_dirty(leaf);
3327 fail:
3328         btrfs_release_path(path);
3329         return ret;
3330
3331 }
3332
3333 static struct btrfs_block_group_cache *
3334 next_block_group(struct btrfs_root *root,
3335                  struct btrfs_block_group_cache *cache)
3336 {
3337         struct rb_node *node;
3338
3339         spin_lock(&root->fs_info->block_group_cache_lock);
3340
3341         /* If our block group was removed, we need a full search. */
3342         if (RB_EMPTY_NODE(&cache->cache_node)) {
3343                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3344
3345                 spin_unlock(&root->fs_info->block_group_cache_lock);
3346                 btrfs_put_block_group(cache);
3347                 cache = btrfs_lookup_first_block_group(root->fs_info,
3348                                                        next_bytenr);
3349                 return cache;
3350         }
3351         node = rb_next(&cache->cache_node);
3352         btrfs_put_block_group(cache);
3353         if (node) {
3354                 cache = rb_entry(node, struct btrfs_block_group_cache,
3355                                  cache_node);
3356                 btrfs_get_block_group(cache);
3357         } else
3358                 cache = NULL;
3359         spin_unlock(&root->fs_info->block_group_cache_lock);
3360         return cache;
3361 }
3362
3363 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3364                             struct btrfs_trans_handle *trans,
3365                             struct btrfs_path *path)
3366 {
3367         struct btrfs_root *root = block_group->fs_info->tree_root;
3368         struct inode *inode = NULL;
3369         u64 alloc_hint = 0;
3370         int dcs = BTRFS_DC_ERROR;
3371         u64 num_pages = 0;
3372         int retries = 0;
3373         int ret = 0;
3374
3375         /*
3376          * If this block group is smaller than 100 megs don't bother caching the
3377          * block group.
3378          */
3379         if (block_group->key.offset < (100 * SZ_1M)) {
3380                 spin_lock(&block_group->lock);
3381                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3382                 spin_unlock(&block_group->lock);
3383                 return 0;
3384         }
3385
3386         if (trans->aborted)
3387                 return 0;
3388 again:
3389         inode = lookup_free_space_inode(root, block_group, path);
3390         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3391                 ret = PTR_ERR(inode);
3392                 btrfs_release_path(path);
3393                 goto out;
3394         }
3395
3396         if (IS_ERR(inode)) {
3397                 BUG_ON(retries);
3398                 retries++;
3399
3400                 if (block_group->ro)
3401                         goto out_free;
3402
3403                 ret = create_free_space_inode(root, trans, block_group, path);
3404                 if (ret)
3405                         goto out_free;
3406                 goto again;
3407         }
3408
3409         /* We've already setup this transaction, go ahead and exit */
3410         if (block_group->cache_generation == trans->transid &&
3411             i_size_read(inode)) {
3412                 dcs = BTRFS_DC_SETUP;
3413                 goto out_put;
3414         }
3415
3416         /*
3417          * We want to set the generation to 0, that way if anything goes wrong
3418          * from here on out we know not to trust this cache when we load up next
3419          * time.
3420          */
3421         BTRFS_I(inode)->generation = 0;
3422         ret = btrfs_update_inode(trans, root, inode);
3423         if (ret) {
3424                 /*
3425                  * So theoretically we could recover from this, simply set the
3426                  * super cache generation to 0 so we know to invalidate the
3427                  * cache, but then we'd have to keep track of the block groups
3428                  * that fail this way so we know we _have_ to reset this cache
3429                  * before the next commit or risk reading stale cache.  So to
3430                  * limit our exposure to horrible edge cases lets just abort the
3431                  * transaction, this only happens in really bad situations
3432                  * anyway.
3433                  */
3434                 btrfs_abort_transaction(trans, ret);
3435                 goto out_put;
3436         }
3437         WARN_ON(ret);
3438
3439         if (i_size_read(inode) > 0) {
3440                 ret = btrfs_check_trunc_cache_free_space(root,
3441                                         &root->fs_info->global_block_rsv);
3442                 if (ret)
3443                         goto out_put;
3444
3445                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3446                 if (ret)
3447                         goto out_put;
3448         }
3449
3450         spin_lock(&block_group->lock);
3451         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3452             !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
3453                 /*
3454                  * don't bother trying to write stuff out _if_
3455                  * a) we're not cached,
3456                  * b) we're with nospace_cache mount option.
3457                  */
3458                 dcs = BTRFS_DC_WRITTEN;
3459                 spin_unlock(&block_group->lock);
3460                 goto out_put;
3461         }
3462         spin_unlock(&block_group->lock);
3463
3464         /*
3465          * We hit an ENOSPC when setting up the cache in this transaction, just
3466          * skip doing the setup, we've already cleared the cache so we're safe.
3467          */
3468         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3469                 ret = -ENOSPC;
3470                 goto out_put;
3471         }
3472
3473         /*
3474          * Try to preallocate enough space based on how big the block group is.
3475          * Keep in mind this has to include any pinned space which could end up
3476          * taking up quite a bit since it's not folded into the other space
3477          * cache.
3478          */
3479         num_pages = div_u64(block_group->key.offset, SZ_256M);
3480         if (!num_pages)
3481                 num_pages = 1;
3482
3483         num_pages *= 16;
3484         num_pages *= PAGE_SIZE;
3485
3486         ret = btrfs_check_data_free_space(inode, 0, num_pages);
3487         if (ret)
3488                 goto out_put;
3489
3490         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3491                                               num_pages, num_pages,
3492                                               &alloc_hint);
3493         /*
3494          * Our cache requires contiguous chunks so that we don't modify a bunch
3495          * of metadata or split extents when writing the cache out, which means
3496          * we can enospc if we are heavily fragmented in addition to just normal
3497          * out of space conditions.  So if we hit this just skip setting up any
3498          * other block groups for this transaction, maybe we'll unpin enough
3499          * space the next time around.
3500          */
3501         if (!ret)
3502                 dcs = BTRFS_DC_SETUP;
3503         else if (ret == -ENOSPC)
3504                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3505         btrfs_free_reserved_data_space(inode, 0, num_pages);
3506
3507 out_put:
3508         iput(inode);
3509 out_free:
3510         btrfs_release_path(path);
3511 out:
3512         spin_lock(&block_group->lock);
3513         if (!ret && dcs == BTRFS_DC_SETUP)
3514                 block_group->cache_generation = trans->transid;
3515         block_group->disk_cache_state = dcs;
3516         spin_unlock(&block_group->lock);
3517
3518         return ret;
3519 }
3520
3521 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3522                             struct btrfs_root *root)
3523 {
3524         struct btrfs_block_group_cache *cache, *tmp;
3525         struct btrfs_transaction *cur_trans = trans->transaction;
3526         struct btrfs_path *path;
3527
3528         if (list_empty(&cur_trans->dirty_bgs) ||
3529             !btrfs_test_opt(root->fs_info, SPACE_CACHE))
3530                 return 0;
3531
3532         path = btrfs_alloc_path();
3533         if (!path)
3534                 return -ENOMEM;
3535
3536         /* Could add new block groups, use _safe just in case */
3537         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3538                                  dirty_list) {
3539                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3540                         cache_save_setup(cache, trans, path);
3541         }
3542
3543         btrfs_free_path(path);
3544         return 0;
3545 }
3546
3547 /*
3548  * transaction commit does final block group cache writeback during a
3549  * critical section where nothing is allowed to change the FS.  This is
3550  * required in order for the cache to actually match the block group,
3551  * but can introduce a lot of latency into the commit.
3552  *
3553  * So, btrfs_start_dirty_block_groups is here to kick off block group
3554  * cache IO.  There's a chance we'll have to redo some of it if the
3555  * block group changes again during the commit, but it greatly reduces
3556  * the commit latency by getting rid of the easy block groups while
3557  * we're still allowing others to join the commit.
3558  */
3559 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3560                                    struct btrfs_root *root)
3561 {
3562         struct btrfs_block_group_cache *cache;
3563         struct btrfs_transaction *cur_trans = trans->transaction;
3564         int ret = 0;
3565         int should_put;
3566         struct btrfs_path *path = NULL;
3567         LIST_HEAD(dirty);
3568         struct list_head *io = &cur_trans->io_bgs;
3569         int num_started = 0;
3570         int loops = 0;
3571
3572         spin_lock(&cur_trans->dirty_bgs_lock);
3573         if (list_empty(&cur_trans->dirty_bgs)) {
3574                 spin_unlock(&cur_trans->dirty_bgs_lock);
3575                 return 0;
3576         }
3577         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3578         spin_unlock(&cur_trans->dirty_bgs_lock);
3579
3580 again:
3581         /*
3582          * make sure all the block groups on our dirty list actually
3583          * exist
3584          */
3585         btrfs_create_pending_block_groups(trans, root);
3586
3587         if (!path) {
3588                 path = btrfs_alloc_path();
3589                 if (!path)
3590                         return -ENOMEM;
3591         }
3592
3593         /*
3594          * cache_write_mutex is here only to save us from balance or automatic
3595          * removal of empty block groups deleting this block group while we are
3596          * writing out the cache
3597          */
3598         mutex_lock(&trans->transaction->cache_write_mutex);
3599         while (!list_empty(&dirty)) {
3600                 cache = list_first_entry(&dirty,
3601                                          struct btrfs_block_group_cache,
3602                                          dirty_list);
3603                 /*
3604                  * this can happen if something re-dirties a block
3605                  * group that is already under IO.  Just wait for it to
3606                  * finish and then do it all again
3607                  */
3608                 if (!list_empty(&cache->io_list)) {
3609                         list_del_init(&cache->io_list);
3610                         btrfs_wait_cache_io(root, trans, cache,
3611                                             &cache->io_ctl, path,
3612                                             cache->key.objectid);
3613                         btrfs_put_block_group(cache);
3614                 }
3615
3616
3617                 /*
3618                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3619                  * if it should update the cache_state.  Don't delete
3620                  * until after we wait.
3621                  *
3622                  * Since we're not running in the commit critical section
3623                  * we need the dirty_bgs_lock to protect from update_block_group
3624                  */
3625                 spin_lock(&cur_trans->dirty_bgs_lock);
3626                 list_del_init(&cache->dirty_list);
3627                 spin_unlock(&cur_trans->dirty_bgs_lock);
3628
3629                 should_put = 1;
3630
3631                 cache_save_setup(cache, trans, path);
3632
3633                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3634                         cache->io_ctl.inode = NULL;
3635                         ret = btrfs_write_out_cache(root, trans, cache, path);
3636                         if (ret == 0 && cache->io_ctl.inode) {
3637                                 num_started++;
3638                                 should_put = 0;
3639
3640                                 /*
3641                                  * the cache_write_mutex is protecting
3642                                  * the io_list
3643                                  */
3644                                 list_add_tail(&cache->io_list, io);
3645                         } else {
3646                                 /*
3647                                  * if we failed to write the cache, the
3648                                  * generation will be bad and life goes on
3649                                  */
3650                                 ret = 0;
3651                         }
3652                 }
3653                 if (!ret) {
3654                         ret = write_one_cache_group(trans, root, path, cache);
3655                         /*
3656                          * Our block group might still be attached to the list
3657                          * of new block groups in the transaction handle of some
3658                          * other task (struct btrfs_trans_handle->new_bgs). This
3659                          * means its block group item isn't yet in the extent
3660                          * tree. If this happens ignore the error, as we will
3661                          * try again later in the critical section of the
3662                          * transaction commit.
3663                          */
3664                         if (ret == -ENOENT) {
3665                                 ret = 0;
3666                                 spin_lock(&cur_trans->dirty_bgs_lock);
3667                                 if (list_empty(&cache->dirty_list)) {
3668                                         list_add_tail(&cache->dirty_list,
3669                                                       &cur_trans->dirty_bgs);
3670                                         btrfs_get_block_group(cache);
3671                                 }
3672                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3673                         } else if (ret) {
3674                                 btrfs_abort_transaction(trans, ret);
3675                         }
3676                 }
3677
3678                 /* if its not on the io list, we need to put the block group */
3679                 if (should_put)
3680                         btrfs_put_block_group(cache);
3681
3682                 if (ret)
3683                         break;
3684
3685                 /*
3686                  * Avoid blocking other tasks for too long. It might even save
3687                  * us from writing caches for block groups that are going to be
3688                  * removed.
3689                  */
3690                 mutex_unlock(&trans->transaction->cache_write_mutex);
3691                 mutex_lock(&trans->transaction->cache_write_mutex);
3692         }
3693         mutex_unlock(&trans->transaction->cache_write_mutex);
3694
3695         /*
3696          * go through delayed refs for all the stuff we've just kicked off
3697          * and then loop back (just once)
3698          */
3699         ret = btrfs_run_delayed_refs(trans, root, 0);
3700         if (!ret && loops == 0) {
3701                 loops++;
3702                 spin_lock(&cur_trans->dirty_bgs_lock);
3703                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3704                 /*
3705                  * dirty_bgs_lock protects us from concurrent block group
3706                  * deletes too (not just cache_write_mutex).
3707                  */
3708                 if (!list_empty(&dirty)) {
3709                         spin_unlock(&cur_trans->dirty_bgs_lock);
3710                         goto again;
3711                 }
3712                 spin_unlock(&cur_trans->dirty_bgs_lock);
3713         }
3714
3715         btrfs_free_path(path);
3716         return ret;
3717 }
3718
3719 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3720                                    struct btrfs_root *root)
3721 {
3722         struct btrfs_block_group_cache *cache;
3723         struct btrfs_transaction *cur_trans = trans->transaction;
3724         int ret = 0;
3725         int should_put;
3726         struct btrfs_path *path;
3727         struct list_head *io = &cur_trans->io_bgs;
3728         int num_started = 0;
3729
3730         path = btrfs_alloc_path();
3731         if (!path)
3732                 return -ENOMEM;
3733
3734         /*
3735          * Even though we are in the critical section of the transaction commit,
3736          * we can still have concurrent tasks adding elements to this
3737          * transaction's list of dirty block groups. These tasks correspond to
3738          * endio free space workers started when writeback finishes for a
3739          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3740          * allocate new block groups as a result of COWing nodes of the root
3741          * tree when updating the free space inode. The writeback for the space
3742          * caches is triggered by an earlier call to
3743          * btrfs_start_dirty_block_groups() and iterations of the following
3744          * loop.
3745          * Also we want to do the cache_save_setup first and then run the
3746          * delayed refs to make sure we have the best chance at doing this all
3747          * in one shot.
3748          */
3749         spin_lock(&cur_trans->dirty_bgs_lock);
3750         while (!list_empty(&cur_trans->dirty_bgs)) {
3751                 cache = list_first_entry(&cur_trans->dirty_bgs,
3752                                          struct btrfs_block_group_cache,
3753                                          dirty_list);
3754
3755                 /*
3756                  * this can happen if cache_save_setup re-dirties a block
3757                  * group that is already under IO.  Just wait for it to
3758                  * finish and then do it all again
3759                  */
3760                 if (!list_empty(&cache->io_list)) {
3761                         spin_unlock(&cur_trans->dirty_bgs_lock);
3762                         list_del_init(&cache->io_list);
3763                         btrfs_wait_cache_io(root, trans, cache,
3764                                             &cache->io_ctl, path,
3765                                             cache->key.objectid);
3766                         btrfs_put_block_group(cache);
3767                         spin_lock(&cur_trans->dirty_bgs_lock);
3768                 }
3769
3770                 /*
3771                  * don't remove from the dirty list until after we've waited
3772                  * on any pending IO
3773                  */
3774                 list_del_init(&cache->dirty_list);
3775                 spin_unlock(&cur_trans->dirty_bgs_lock);
3776                 should_put = 1;
3777
3778                 cache_save_setup(cache, trans, path);
3779
3780                 if (!ret)
3781                         ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3782
3783                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3784                         cache->io_ctl.inode = NULL;
3785                         ret = btrfs_write_out_cache(root, trans, cache, path);
3786                         if (ret == 0 && cache->io_ctl.inode) {
3787                                 num_started++;
3788                                 should_put = 0;
3789                                 list_add_tail(&cache->io_list, io);
3790                         } else {
3791                                 /*
3792                                  * if we failed to write the cache, the
3793                                  * generation will be bad and life goes on
3794                                  */
3795                                 ret = 0;
3796                         }
3797                 }
3798                 if (!ret) {
3799                         ret = write_one_cache_group(trans, root, path, cache);
3800                         /*
3801                          * One of the free space endio workers might have
3802                          * created a new block group while updating a free space
3803                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3804                          * and hasn't released its transaction handle yet, in
3805                          * which case the new block group is still attached to
3806                          * its transaction handle and its creation has not
3807                          * finished yet (no block group item in the extent tree
3808                          * yet, etc). If this is the case, wait for all free
3809                          * space endio workers to finish and retry. This is a
3810                          * a very rare case so no need for a more efficient and
3811                          * complex approach.
3812                          */
3813                         if (ret == -ENOENT) {
3814                                 wait_event(cur_trans->writer_wait,
3815                                    atomic_read(&cur_trans->num_writers) == 1);
3816                                 ret = write_one_cache_group(trans, root, path,
3817                                                             cache);
3818                         }
3819                         if (ret)
3820                                 btrfs_abort_transaction(trans, ret);
3821                 }
3822
3823                 /* if its not on the io list, we need to put the block group */
3824                 if (should_put)
3825                         btrfs_put_block_group(cache);
3826                 spin_lock(&cur_trans->dirty_bgs_lock);
3827         }
3828         spin_unlock(&cur_trans->dirty_bgs_lock);
3829
3830         while (!list_empty(io)) {
3831                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3832                                          io_list);
3833                 list_del_init(&cache->io_list);
3834                 btrfs_wait_cache_io(root, trans, cache,
3835                                     &cache->io_ctl, path, cache->key.objectid);
3836                 btrfs_put_block_group(cache);
3837         }
3838
3839         btrfs_free_path(path);
3840         return ret;
3841 }
3842
3843 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3844 {
3845         struct btrfs_block_group_cache *block_group;
3846         int readonly = 0;
3847
3848         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3849         if (!block_group || block_group->ro)
3850                 readonly = 1;
3851         if (block_group)
3852                 btrfs_put_block_group(block_group);
3853         return readonly;
3854 }
3855
3856 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3857 {
3858         struct btrfs_block_group_cache *bg;
3859         bool ret = true;
3860
3861         bg = btrfs_lookup_block_group(fs_info, bytenr);
3862         if (!bg)
3863                 return false;
3864
3865         spin_lock(&bg->lock);
3866         if (bg->ro)
3867                 ret = false;
3868         else
3869                 atomic_inc(&bg->nocow_writers);
3870         spin_unlock(&bg->lock);
3871
3872         /* no put on block group, done by btrfs_dec_nocow_writers */
3873         if (!ret)
3874                 btrfs_put_block_group(bg);
3875
3876         return ret;
3877
3878 }
3879
3880 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3881 {
3882         struct btrfs_block_group_cache *bg;
3883
3884         bg = btrfs_lookup_block_group(fs_info, bytenr);
3885         ASSERT(bg);
3886         if (atomic_dec_and_test(&bg->nocow_writers))
3887                 wake_up_atomic_t(&bg->nocow_writers);
3888         /*
3889          * Once for our lookup and once for the lookup done by a previous call
3890          * to btrfs_inc_nocow_writers()
3891          */
3892         btrfs_put_block_group(bg);
3893         btrfs_put_block_group(bg);
3894 }
3895
3896 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
3897 {
3898         schedule();
3899         return 0;
3900 }
3901
3902 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3903 {
3904         wait_on_atomic_t(&bg->nocow_writers,
3905                          btrfs_wait_nocow_writers_atomic_t,
3906                          TASK_UNINTERRUPTIBLE);
3907 }
3908
3909 static const char *alloc_name(u64 flags)
3910 {
3911         switch (flags) {
3912         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3913                 return "mixed";
3914         case BTRFS_BLOCK_GROUP_METADATA:
3915                 return "metadata";
3916         case BTRFS_BLOCK_GROUP_DATA:
3917                 return "data";
3918         case BTRFS_BLOCK_GROUP_SYSTEM:
3919                 return "system";
3920         default:
3921                 WARN_ON(1);
3922                 return "invalid-combination";
3923         };
3924 }
3925
3926 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3927                              u64 total_bytes, u64 bytes_used,
3928                              u64 bytes_readonly,
3929                              struct btrfs_space_info **space_info)
3930 {
3931         struct btrfs_space_info *found;
3932         int i;
3933         int factor;
3934         int ret;
3935
3936         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3937                      BTRFS_BLOCK_GROUP_RAID10))
3938                 factor = 2;
3939         else
3940                 factor = 1;
3941
3942         found = __find_space_info(info, flags);
3943         if (found) {
3944                 spin_lock(&found->lock);
3945                 found->total_bytes += total_bytes;
3946                 found->disk_total += total_bytes * factor;
3947                 found->bytes_used += bytes_used;
3948                 found->disk_used += bytes_used * factor;
3949                 found->bytes_readonly += bytes_readonly;
3950                 if (total_bytes > 0)
3951                         found->full = 0;
3952                 space_info_add_new_bytes(info, found, total_bytes -
3953                                          bytes_used - bytes_readonly);
3954                 spin_unlock(&found->lock);
3955                 *space_info = found;
3956                 return 0;
3957         }
3958         found = kzalloc(sizeof(*found), GFP_NOFS);
3959         if (!found)
3960                 return -ENOMEM;
3961
3962         ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3963         if (ret) {
3964                 kfree(found);
3965                 return ret;
3966         }
3967
3968         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3969                 INIT_LIST_HEAD(&found->block_groups[i]);
3970         init_rwsem(&found->groups_sem);
3971         spin_lock_init(&found->lock);
3972         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3973         found->total_bytes = total_bytes;
3974         found->disk_total = total_bytes * factor;
3975         found->bytes_used = bytes_used;
3976         found->disk_used = bytes_used * factor;
3977         found->bytes_pinned = 0;
3978         found->bytes_reserved = 0;
3979         found->bytes_readonly = bytes_readonly;
3980         found->bytes_may_use = 0;
3981         found->full = 0;
3982         found->max_extent_size = 0;
3983         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3984         found->chunk_alloc = 0;
3985         found->flush = 0;
3986         init_waitqueue_head(&found->wait);
3987         INIT_LIST_HEAD(&found->ro_bgs);
3988         INIT_LIST_HEAD(&found->tickets);
3989         INIT_LIST_HEAD(&found->priority_tickets);
3990
3991         ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3992                                     info->space_info_kobj, "%s",
3993                                     alloc_name(found->flags));
3994         if (ret) {
3995                 kfree(found);
3996                 return ret;
3997         }
3998
3999         *space_info = found;
4000         list_add_rcu(&found->list, &info->space_info);
4001         if (flags & BTRFS_BLOCK_GROUP_DATA)
4002                 info->data_sinfo = found;
4003
4004         return ret;
4005 }
4006
4007 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4008 {
4009         u64 extra_flags = chunk_to_extended(flags) &
4010                                 BTRFS_EXTENDED_PROFILE_MASK;
4011
4012         write_seqlock(&fs_info->profiles_lock);
4013         if (flags & BTRFS_BLOCK_GROUP_DATA)
4014                 fs_info->avail_data_alloc_bits |= extra_flags;
4015         if (flags & BTRFS_BLOCK_GROUP_METADATA)
4016                 fs_info->avail_metadata_alloc_bits |= extra_flags;
4017         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4018                 fs_info->avail_system_alloc_bits |= extra_flags;
4019         write_sequnlock(&fs_info->profiles_lock);
4020 }
4021
4022 /*
4023  * returns target flags in extended format or 0 if restripe for this
4024  * chunk_type is not in progress
4025  *
4026  * should be called with either volume_mutex or balance_lock held
4027  */
4028 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4029 {
4030         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4031         u64 target = 0;
4032
4033         if (!bctl)
4034                 return 0;
4035
4036         if (flags & BTRFS_BLOCK_GROUP_DATA &&
4037             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4038                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4039         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4040                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4041                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4042         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4043                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4044                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4045         }
4046
4047         return target;
4048 }
4049
4050 /*
4051  * @flags: available profiles in extended format (see ctree.h)
4052  *
4053  * Returns reduced profile in chunk format.  If profile changing is in
4054  * progress (either running or paused) picks the target profile (if it's
4055  * already available), otherwise falls back to plain reducing.
4056  */
4057 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
4058 {
4059         u64 num_devices = root->fs_info->fs_devices->rw_devices;
4060         u64 target;
4061         u64 raid_type;
4062         u64 allowed = 0;
4063
4064         /*
4065          * see if restripe for this chunk_type is in progress, if so
4066          * try to reduce to the target profile
4067          */
4068         spin_lock(&root->fs_info->balance_lock);
4069         target = get_restripe_target(root->fs_info, flags);
4070         if (target) {
4071                 /* pick target profile only if it's already available */
4072                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4073                         spin_unlock(&root->fs_info->balance_lock);
4074                         return extended_to_chunk(target);
4075                 }
4076         }
4077         spin_unlock(&root->fs_info->balance_lock);
4078
4079         /* First, mask out the RAID levels which aren't possible */
4080         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4081                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4082                         allowed |= btrfs_raid_group[raid_type];
4083         }
4084         allowed &= flags;
4085
4086         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4087                 allowed = BTRFS_BLOCK_GROUP_RAID6;
4088         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4089                 allowed = BTRFS_BLOCK_GROUP_RAID5;
4090         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4091                 allowed = BTRFS_BLOCK_GROUP_RAID10;
4092         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4093                 allowed = BTRFS_BLOCK_GROUP_RAID1;
4094         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4095                 allowed = BTRFS_BLOCK_GROUP_RAID0;
4096
4097         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4098
4099         return extended_to_chunk(flags | allowed);
4100 }
4101
4102 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
4103 {
4104         unsigned seq;
4105         u64 flags;
4106
4107         do {
4108                 flags = orig_flags;
4109                 seq = read_seqbegin(&root->fs_info->profiles_lock);
4110
4111                 if (flags & BTRFS_BLOCK_GROUP_DATA)
4112                         flags |= root->fs_info->avail_data_alloc_bits;
4113                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4114                         flags |= root->fs_info->avail_system_alloc_bits;
4115                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4116                         flags |= root->fs_info->avail_metadata_alloc_bits;
4117         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
4118
4119         return btrfs_reduce_alloc_profile(root, flags);
4120 }
4121
4122 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
4123 {
4124         u64 flags;
4125         u64 ret;
4126
4127         if (data)
4128                 flags = BTRFS_BLOCK_GROUP_DATA;
4129         else if (root == root->fs_info->chunk_root)
4130                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
4131         else
4132                 flags = BTRFS_BLOCK_GROUP_METADATA;
4133
4134         ret = get_alloc_profile(root, flags);
4135         return ret;
4136 }
4137
4138 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
4139 {
4140         struct btrfs_space_info *data_sinfo;
4141         struct btrfs_root *root = BTRFS_I(inode)->root;
4142         struct btrfs_fs_info *fs_info = root->fs_info;
4143         u64 used;
4144         int ret = 0;
4145         int need_commit = 2;
4146         int have_pinned_space;
4147
4148         /* make sure bytes are sectorsize aligned */
4149         bytes = ALIGN(bytes, root->sectorsize);
4150
4151         if (btrfs_is_free_space_inode(inode)) {
4152                 need_commit = 0;
4153                 ASSERT(current->journal_info);
4154         }
4155
4156         data_sinfo = fs_info->data_sinfo;
4157         if (!data_sinfo)
4158                 goto alloc;
4159
4160 again:
4161         /* make sure we have enough space to handle the data first */
4162         spin_lock(&data_sinfo->lock);
4163         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
4164                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
4165                 data_sinfo->bytes_may_use;
4166
4167         if (used + bytes > data_sinfo->total_bytes) {
4168                 struct btrfs_trans_handle *trans;
4169
4170                 /*
4171                  * if we don't have enough free bytes in this space then we need
4172                  * to alloc a new chunk.
4173                  */
4174                 if (!data_sinfo->full) {
4175                         u64 alloc_target;
4176
4177                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4178                         spin_unlock(&data_sinfo->lock);
4179 alloc:
4180                         alloc_target = btrfs_get_alloc_profile(root, 1);
4181                         /*
4182                          * It is ugly that we don't call nolock join
4183                          * transaction for the free space inode case here.
4184                          * But it is safe because we only do the data space
4185                          * reservation for the free space cache in the
4186                          * transaction context, the common join transaction
4187                          * just increase the counter of the current transaction
4188                          * handler, doesn't try to acquire the trans_lock of
4189                          * the fs.
4190                          */
4191                         trans = btrfs_join_transaction(root);
4192                         if (IS_ERR(trans))
4193                                 return PTR_ERR(trans);
4194
4195                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4196                                              alloc_target,
4197                                              CHUNK_ALLOC_NO_FORCE);
4198                         btrfs_end_transaction(trans, root);
4199                         if (ret < 0) {
4200                                 if (ret != -ENOSPC)
4201                                         return ret;
4202                                 else {
4203                                         have_pinned_space = 1;
4204                                         goto commit_trans;
4205                                 }
4206                         }
4207
4208                         if (!data_sinfo)
4209                                 data_sinfo = fs_info->data_sinfo;
4210
4211                         goto again;
4212                 }
4213
4214                 /*
4215                  * If we don't have enough pinned space to deal with this
4216                  * allocation, and no removed chunk in current transaction,
4217                  * don't bother committing the transaction.
4218                  */
4219                 have_pinned_space = percpu_counter_compare(
4220                         &data_sinfo->total_bytes_pinned,
4221                         used + bytes - data_sinfo->total_bytes);
4222                 spin_unlock(&data_sinfo->lock);
4223
4224                 /* commit the current transaction and try again */
4225 commit_trans:
4226                 if (need_commit &&
4227                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
4228                         need_commit--;
4229
4230                         if (need_commit > 0) {
4231                                 btrfs_start_delalloc_roots(fs_info, 0, -1);
4232                                 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
4233                         }
4234
4235                         trans = btrfs_join_transaction(root);
4236                         if (IS_ERR(trans))
4237                                 return PTR_ERR(trans);
4238                         if (have_pinned_space >= 0 ||
4239                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4240                                      &trans->transaction->flags) ||
4241                             need_commit > 0) {
4242                                 ret = btrfs_commit_transaction(trans, root);
4243                                 if (ret)
4244                                         return ret;
4245                                 /*
4246                                  * The cleaner kthread might still be doing iput
4247                                  * operations. Wait for it to finish so that
4248                                  * more space is released.
4249                                  */
4250                                 mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
4251                                 mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
4252                                 goto again;
4253                         } else {
4254                                 btrfs_end_transaction(trans, root);
4255                         }
4256                 }
4257
4258                 trace_btrfs_space_reservation(root->fs_info,
4259                                               "space_info:enospc",
4260                                               data_sinfo->flags, bytes, 1);
4261                 return -ENOSPC;
4262         }
4263         data_sinfo->bytes_may_use += bytes;
4264         trace_btrfs_space_reservation(root->fs_info, "space_info",
4265                                       data_sinfo->flags, bytes, 1);
4266         spin_unlock(&data_sinfo->lock);
4267
4268         return ret;
4269 }
4270
4271 /*
4272  * New check_data_free_space() with ability for precious data reservation
4273  * Will replace old btrfs_check_data_free_space(), but for patch split,
4274  * add a new function first and then replace it.
4275  */
4276 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4277 {
4278         struct btrfs_root *root = BTRFS_I(inode)->root;
4279         int ret;
4280
4281         /* align the range */
4282         len = round_up(start + len, root->sectorsize) -
4283               round_down(start, root->sectorsize);
4284         start = round_down(start, root->sectorsize);
4285
4286         ret = btrfs_alloc_data_chunk_ondemand(inode, len);
4287         if (ret < 0)
4288                 return ret;
4289
4290         /*
4291          * Use new btrfs_qgroup_reserve_data to reserve precious data space
4292          *
4293          * TODO: Find a good method to avoid reserve data space for NOCOW
4294          * range, but don't impact performance on quota disable case.
4295          */
4296         ret = btrfs_qgroup_reserve_data(inode, start, len);
4297         return ret;
4298 }
4299
4300 /*
4301  * Called if we need to clear a data reservation for this inode
4302  * Normally in a error case.
4303  *
4304  * This one will *NOT* use accurate qgroup reserved space API, just for case
4305  * which we can't sleep and is sure it won't affect qgroup reserved space.
4306  * Like clear_bit_hook().
4307  */
4308 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4309                                             u64 len)
4310 {
4311         struct btrfs_root *root = BTRFS_I(inode)->root;
4312         struct btrfs_space_info *data_sinfo;
4313
4314         /* Make sure the range is aligned to sectorsize */
4315         len = round_up(start + len, root->sectorsize) -
4316               round_down(start, root->sectorsize);
4317         start = round_down(start, root->sectorsize);
4318
4319         data_sinfo = root->fs_info->data_sinfo;
4320         spin_lock(&data_sinfo->lock);
4321         if (WARN_ON(data_sinfo->bytes_may_use < len))
4322                 data_sinfo->bytes_may_use = 0;
4323         else
4324                 data_sinfo->bytes_may_use -= len;
4325         trace_btrfs_space_reservation(root->fs_info, "space_info",
4326                                       data_sinfo->flags, len, 0);
4327         spin_unlock(&data_sinfo->lock);
4328 }
4329
4330 /*
4331  * Called if we need to clear a data reservation for this inode
4332  * Normally in a error case.
4333  *
4334  * This one will handle the per-inode data rsv map for accurate reserved
4335  * space framework.
4336  */
4337 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4338 {
4339         btrfs_free_reserved_data_space_noquota(inode, start, len);
4340         btrfs_qgroup_free_data(inode, start, len);
4341 }
4342
4343 static void force_metadata_allocation(struct btrfs_fs_info *info)
4344 {
4345         struct list_head *head = &info->space_info;
4346         struct btrfs_space_info *found;
4347
4348         rcu_read_lock();
4349         list_for_each_entry_rcu(found, head, list) {
4350                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4351                         found->force_alloc = CHUNK_ALLOC_FORCE;
4352         }
4353         rcu_read_unlock();
4354 }
4355
4356 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4357 {
4358         return (global->size << 1);
4359 }
4360
4361 static int should_alloc_chunk(struct btrfs_root *root,
4362                               struct btrfs_space_info *sinfo, int force)
4363 {
4364         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4365         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4366         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4367         u64 thresh;
4368
4369         if (force == CHUNK_ALLOC_FORCE)
4370                 return 1;
4371
4372         /*
4373          * We need to take into account the global rsv because for all intents
4374          * and purposes it's used space.  Don't worry about locking the
4375          * global_rsv, it doesn't change except when the transaction commits.
4376          */
4377         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4378                 num_allocated += calc_global_rsv_need_space(global_rsv);
4379
4380         /*
4381          * in limited mode, we want to have some free space up to
4382          * about 1% of the FS size.
4383          */
4384         if (force == CHUNK_ALLOC_LIMITED) {
4385                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4386                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4387
4388                 if (num_bytes - num_allocated < thresh)
4389                         return 1;
4390         }
4391
4392         if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
4393                 return 0;
4394         return 1;
4395 }
4396
4397 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4398 {
4399         u64 num_dev;
4400
4401         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4402                     BTRFS_BLOCK_GROUP_RAID0 |
4403                     BTRFS_BLOCK_GROUP_RAID5 |
4404                     BTRFS_BLOCK_GROUP_RAID6))
4405                 num_dev = root->fs_info->fs_devices->rw_devices;
4406         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4407                 num_dev = 2;
4408         else
4409                 num_dev = 1;    /* DUP or single */
4410
4411         return num_dev;
4412 }
4413
4414 /*
4415  * If @is_allocation is true, reserve space in the system space info necessary
4416  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4417  * removing a chunk.
4418  */
4419 void check_system_chunk(struct btrfs_trans_handle *trans,
4420                         struct btrfs_root *root,
4421                         u64 type)
4422 {
4423         struct btrfs_space_info *info;
4424         u64 left;
4425         u64 thresh;
4426         int ret = 0;
4427         u64 num_devs;
4428
4429         /*
4430          * Needed because we can end up allocating a system chunk and for an
4431          * atomic and race free space reservation in the chunk block reserve.
4432          */
4433         ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4434
4435         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4436         spin_lock(&info->lock);
4437         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4438                 info->bytes_reserved - info->bytes_readonly -
4439                 info->bytes_may_use;
4440         spin_unlock(&info->lock);
4441
4442         num_devs = get_profile_num_devs(root, type);
4443
4444         /* num_devs device items to update and 1 chunk item to add or remove */
4445         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4446                 btrfs_calc_trans_metadata_size(root, 1);
4447
4448         if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
4449                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4450                         left, thresh, type);
4451                 dump_space_info(info, 0, 0);
4452         }
4453
4454         if (left < thresh) {
4455                 u64 flags;
4456
4457                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4458                 /*
4459                  * Ignore failure to create system chunk. We might end up not
4460                  * needing it, as we might not need to COW all nodes/leafs from
4461                  * the paths we visit in the chunk tree (they were already COWed
4462                  * or created in the current transaction for example).
4463                  */
4464                 ret = btrfs_alloc_chunk(trans, root, flags);
4465         }
4466
4467         if (!ret) {
4468                 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4469                                           &root->fs_info->chunk_block_rsv,
4470                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4471                 if (!ret)
4472                         trans->chunk_bytes_reserved += thresh;
4473         }
4474 }
4475
4476 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4477                           struct btrfs_root *extent_root, u64 flags, int force)
4478 {
4479         struct btrfs_space_info *space_info;
4480         struct btrfs_fs_info *fs_info = extent_root->fs_info;
4481         int wait_for_alloc = 0;
4482         int ret = 0;
4483
4484         /* Don't re-enter if we're already allocating a chunk */
4485         if (trans->allocating_chunk)
4486                 return -ENOSPC;
4487
4488         space_info = __find_space_info(extent_root->fs_info, flags);
4489         if (!space_info) {
4490                 ret = update_space_info(extent_root->fs_info, flags,
4491                                         0, 0, 0, &space_info);
4492                 BUG_ON(ret); /* -ENOMEM */
4493         }
4494         BUG_ON(!space_info); /* Logic error */
4495
4496 again:
4497         spin_lock(&space_info->lock);
4498         if (force < space_info->force_alloc)
4499                 force = space_info->force_alloc;
4500         if (space_info->full) {
4501                 if (should_alloc_chunk(extent_root, space_info, force))
4502                         ret = -ENOSPC;
4503                 else
4504                         ret = 0;
4505                 spin_unlock(&space_info->lock);
4506                 return ret;
4507         }
4508
4509         if (!should_alloc_chunk(extent_root, space_info, force)) {
4510                 spin_unlock(&space_info->lock);
4511                 return 0;
4512         } else if (space_info->chunk_alloc) {
4513                 wait_for_alloc = 1;
4514         } else {
4515                 space_info->chunk_alloc = 1;
4516         }
4517
4518         spin_unlock(&space_info->lock);
4519
4520         mutex_lock(&fs_info->chunk_mutex);
4521
4522         /*
4523          * The chunk_mutex is held throughout the entirety of a chunk
4524          * allocation, so once we've acquired the chunk_mutex we know that the
4525          * other guy is done and we need to recheck and see if we should
4526          * allocate.
4527          */
4528         if (wait_for_alloc) {
4529                 mutex_unlock(&fs_info->chunk_mutex);
4530                 wait_for_alloc = 0;
4531                 goto again;
4532         }
4533
4534         trans->allocating_chunk = true;
4535
4536         /*
4537          * If we have mixed data/metadata chunks we want to make sure we keep
4538          * allocating mixed chunks instead of individual chunks.
4539          */
4540         if (btrfs_mixed_space_info(space_info))
4541                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4542
4543         /*
4544          * if we're doing a data chunk, go ahead and make sure that
4545          * we keep a reasonable number of metadata chunks allocated in the
4546          * FS as well.
4547          */
4548         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4549                 fs_info->data_chunk_allocations++;
4550                 if (!(fs_info->data_chunk_allocations %
4551                       fs_info->metadata_ratio))
4552                         force_metadata_allocation(fs_info);
4553         }
4554
4555         /*
4556          * Check if we have enough space in SYSTEM chunk because we may need
4557          * to update devices.
4558          */
4559         check_system_chunk(trans, extent_root, flags);
4560
4561         ret = btrfs_alloc_chunk(trans, extent_root, flags);
4562         trans->allocating_chunk = false;
4563
4564         spin_lock(&space_info->lock);
4565         if (ret < 0 && ret != -ENOSPC)
4566                 goto out;
4567         if (ret)
4568                 space_info->full = 1;
4569         else
4570                 ret = 1;
4571
4572         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4573 out:
4574         space_info->chunk_alloc = 0;
4575         spin_unlock(&space_info->lock);
4576         mutex_unlock(&fs_info->chunk_mutex);
4577         /*
4578          * When we allocate a new chunk we reserve space in the chunk block
4579          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4580          * add new nodes/leafs to it if we end up needing to do it when
4581          * inserting the chunk item and updating device items as part of the
4582          * second phase of chunk allocation, performed by
4583          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4584          * large number of new block groups to create in our transaction
4585          * handle's new_bgs list to avoid exhausting the chunk block reserve
4586          * in extreme cases - like having a single transaction create many new
4587          * block groups when starting to write out the free space caches of all
4588          * the block groups that were made dirty during the lifetime of the
4589          * transaction.
4590          */
4591         if (trans->can_flush_pending_bgs &&
4592             trans->chunk_bytes_reserved >= (u64)SZ_2M) {
4593                 btrfs_create_pending_block_groups(trans, extent_root);
4594                 btrfs_trans_release_chunk_metadata(trans);
4595         }
4596         return ret;
4597 }
4598
4599 static int can_overcommit(struct btrfs_root *root,
4600                           struct btrfs_space_info *space_info, u64 bytes,
4601                           enum btrfs_reserve_flush_enum flush)
4602 {
4603         struct btrfs_block_rsv *global_rsv;
4604         u64 profile;
4605         u64 space_size;
4606         u64 avail;
4607         u64 used;
4608
4609         /* Don't overcommit when in mixed mode. */
4610         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4611                 return 0;
4612
4613         BUG_ON(root->fs_info == NULL);
4614         global_rsv = &root->fs_info->global_block_rsv;
4615         profile = btrfs_get_alloc_profile(root, 0);
4616         used = space_info->bytes_used + space_info->bytes_reserved +
4617                 space_info->bytes_pinned + space_info->bytes_readonly;
4618
4619         /*
4620          * We only want to allow over committing if we have lots of actual space
4621          * free, but if we don't have enough space to handle the global reserve
4622          * space then we could end up having a real enospc problem when trying
4623          * to allocate a chunk or some other such important allocation.
4624          */
4625         spin_lock(&global_rsv->lock);
4626         space_size = calc_global_rsv_need_space(global_rsv);
4627         spin_unlock(&global_rsv->lock);
4628         if (used + space_size >= space_info->total_bytes)
4629                 return 0;
4630
4631         used += space_info->bytes_may_use;
4632
4633         spin_lock(&root->fs_info->free_chunk_lock);
4634         avail = root->fs_info->free_chunk_space;
4635         spin_unlock(&root->fs_info->free_chunk_lock);
4636
4637         /*
4638          * If we have dup, raid1 or raid10 then only half of the free
4639          * space is actually useable.  For raid56, the space info used
4640          * doesn't include the parity drive, so we don't have to
4641          * change the math
4642          */
4643         if (profile & (BTRFS_BLOCK_GROUP_DUP |
4644                        BTRFS_BLOCK_GROUP_RAID1 |
4645                        BTRFS_BLOCK_GROUP_RAID10))
4646                 avail >>= 1;
4647
4648         /*
4649          * If we aren't flushing all things, let us overcommit up to
4650          * 1/2th of the space. If we can flush, don't let us overcommit
4651          * too much, let it overcommit up to 1/8 of the space.
4652          */
4653         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4654                 avail >>= 3;
4655         else
4656                 avail >>= 1;
4657
4658         if (used + bytes < space_info->total_bytes + avail)
4659                 return 1;
4660         return 0;
4661 }
4662
4663 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4664                                          unsigned long nr_pages, int nr_items)
4665 {
4666         struct super_block *sb = root->fs_info->sb;
4667
4668         if (down_read_trylock(&sb->s_umount)) {
4669                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4670                 up_read(&sb->s_umount);
4671         } else {
4672                 /*
4673                  * We needn't worry the filesystem going from r/w to r/o though
4674                  * we don't acquire ->s_umount mutex, because the filesystem
4675                  * should guarantee the delalloc inodes list be empty after
4676                  * the filesystem is readonly(all dirty pages are written to
4677                  * the disk).
4678                  */
4679                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4680                 if (!current->journal_info)
4681                         btrfs_wait_ordered_roots(root->fs_info, nr_items,
4682                                                  0, (u64)-1);
4683         }
4684 }
4685
4686 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4687 {
4688         u64 bytes;
4689         int nr;
4690
4691         bytes = btrfs_calc_trans_metadata_size(root, 1);
4692         nr = (int)div64_u64(to_reclaim, bytes);
4693         if (!nr)
4694                 nr = 1;
4695         return nr;
4696 }
4697
4698 #define EXTENT_SIZE_PER_ITEM    SZ_256K
4699
4700 /*
4701  * shrink metadata reservation for delalloc
4702  */
4703 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4704                             bool wait_ordered)
4705 {
4706         struct btrfs_block_rsv *block_rsv;
4707         struct btrfs_space_info *space_info;
4708         struct btrfs_trans_handle *trans;
4709         u64 delalloc_bytes;
4710         u64 max_reclaim;
4711         long time_left;
4712         unsigned long nr_pages;
4713         int loops;
4714         int items;
4715         enum btrfs_reserve_flush_enum flush;
4716
4717         /* Calc the number of the pages we need flush for space reservation */
4718         items = calc_reclaim_items_nr(root, to_reclaim);
4719         to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
4720
4721         trans = (struct btrfs_trans_handle *)current->journal_info;
4722         block_rsv = &root->fs_info->delalloc_block_rsv;
4723         space_info = block_rsv->space_info;
4724
4725         delalloc_bytes = percpu_counter_sum_positive(
4726                                                 &root->fs_info->delalloc_bytes);
4727         if (delalloc_bytes == 0) {
4728                 if (trans)
4729                         return;
4730                 if (wait_ordered)
4731                         btrfs_wait_ordered_roots(root->fs_info, items,
4732                                                  0, (u64)-1);
4733                 return;
4734         }
4735
4736         loops = 0;
4737         while (delalloc_bytes && loops < 3) {
4738                 max_reclaim = min(delalloc_bytes, to_reclaim);
4739                 nr_pages = max_reclaim >> PAGE_SHIFT;
4740                 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4741                 /*
4742                  * We need to wait for the async pages to actually start before
4743                  * we do anything.
4744                  */
4745                 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4746                 if (!max_reclaim)
4747                         goto skip_async;
4748
4749                 if (max_reclaim <= nr_pages)
4750                         max_reclaim = 0;
4751                 else
4752                         max_reclaim -= nr_pages;
4753
4754                 wait_event(root->fs_info->async_submit_wait,
4755                            atomic_read(&root->fs_info->async_delalloc_pages) <=
4756                            (int)max_reclaim);
4757 skip_async:
4758                 if (!trans)
4759                         flush = BTRFS_RESERVE_FLUSH_ALL;
4760                 else
4761                         flush = BTRFS_RESERVE_NO_FLUSH;
4762                 spin_lock(&space_info->lock);
4763                 if (can_overcommit(root, space_info, orig, flush)) {
4764                         spin_unlock(&space_info->lock);
4765                         break;
4766                 }
4767                 if (list_empty(&space_info->tickets) &&
4768                     list_empty(&space_info->priority_tickets)) {
4769                         spin_unlock(&space_info->lock);
4770                         break;
4771                 }
4772                 spin_unlock(&space_info->lock);
4773
4774                 loops++;
4775                 if (wait_ordered && !trans) {
4776                         btrfs_wait_ordered_roots(root->fs_info, items,
4777                                                  0, (u64)-1);
4778                 } else {
4779                         time_left = schedule_timeout_killable(1);
4780                         if (time_left)
4781                                 break;
4782                 }
4783                 delalloc_bytes = percpu_counter_sum_positive(
4784                                                 &root->fs_info->delalloc_bytes);
4785         }
4786 }
4787
4788 /**
4789  * maybe_commit_transaction - possibly commit the transaction if its ok to
4790  * @root - the root we're allocating for
4791  * @bytes - the number of bytes we want to reserve
4792  * @force - force the commit
4793  *
4794  * This will check to make sure that committing the transaction will actually
4795  * get us somewhere and then commit the transaction if it does.  Otherwise it
4796  * will return -ENOSPC.
4797  */
4798 static int may_commit_transaction(struct btrfs_root *root,
4799                                   struct btrfs_space_info *space_info,
4800                                   u64 bytes, int force)
4801 {
4802         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4803         struct btrfs_trans_handle *trans;
4804
4805         trans = (struct btrfs_trans_handle *)current->journal_info;
4806         if (trans)
4807                 return -EAGAIN;
4808
4809         if (force)
4810                 goto commit;
4811
4812         /* See if there is enough pinned space to make this reservation */
4813         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4814                                    bytes) >= 0)
4815                 goto commit;
4816
4817         /*
4818          * See if there is some space in the delayed insertion reservation for
4819          * this reservation.
4820          */
4821         if (space_info != delayed_rsv->space_info)
4822                 return -ENOSPC;
4823
4824         spin_lock(&delayed_rsv->lock);
4825         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4826                                    bytes - delayed_rsv->size) >= 0) {
4827                 spin_unlock(&delayed_rsv->lock);
4828                 return -ENOSPC;
4829         }
4830         spin_unlock(&delayed_rsv->lock);
4831
4832 commit:
4833         trans = btrfs_join_transaction(root);
4834         if (IS_ERR(trans))
4835                 return -ENOSPC;
4836
4837         return btrfs_commit_transaction(trans, root);
4838 }
4839
4840 struct reserve_ticket {
4841         u64 bytes;
4842         int error;
4843         struct list_head list;
4844         wait_queue_head_t wait;
4845 };
4846
4847 static int flush_space(struct btrfs_root *root,
4848                        struct btrfs_space_info *space_info, u64 num_bytes,
4849                        u64 orig_bytes, int state)
4850 {
4851         struct btrfs_trans_handle *trans;
4852         int nr;
4853         int ret = 0;
4854
4855         switch (state) {
4856         case FLUSH_DELAYED_ITEMS_NR:
4857         case FLUSH_DELAYED_ITEMS:
4858                 if (state == FLUSH_DELAYED_ITEMS_NR)
4859                         nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4860                 else
4861                         nr = -1;
4862
4863                 trans = btrfs_join_transaction(root);
4864                 if (IS_ERR(trans)) {
4865                         ret = PTR_ERR(trans);
4866                         break;
4867                 }
4868                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4869                 btrfs_end_transaction(trans, root);
4870                 break;
4871         case FLUSH_DELALLOC:
4872         case FLUSH_DELALLOC_WAIT:
4873                 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4874                                 state == FLUSH_DELALLOC_WAIT);
4875                 break;
4876         case ALLOC_CHUNK:
4877                 trans = btrfs_join_transaction(root);
4878                 if (IS_ERR(trans)) {
4879                         ret = PTR_ERR(trans);
4880                         break;
4881                 }
4882                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4883                                      btrfs_get_alloc_profile(root, 0),
4884                                      CHUNK_ALLOC_NO_FORCE);
4885                 btrfs_end_transaction(trans, root);
4886                 if (ret > 0 || ret == -ENOSPC)
4887                         ret = 0;
4888                 break;
4889         case COMMIT_TRANS:
4890                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4891                 break;
4892         default:
4893                 ret = -ENOSPC;
4894                 break;
4895         }
4896
4897         trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
4898                                 orig_bytes, state, ret);
4899         return ret;
4900 }
4901
4902 static inline u64
4903 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4904                                  struct btrfs_space_info *space_info)
4905 {
4906         struct reserve_ticket *ticket;
4907         u64 used;
4908         u64 expected;
4909         u64 to_reclaim = 0;
4910
4911         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4912         if (can_overcommit(root, space_info, to_reclaim,
4913                            BTRFS_RESERVE_FLUSH_ALL))
4914                 return 0;
4915
4916         list_for_each_entry(ticket, &space_info->tickets, list)
4917                 to_reclaim += ticket->bytes;
4918         list_for_each_entry(ticket, &space_info->priority_tickets, list)
4919                 to_reclaim += ticket->bytes;
4920         if (to_reclaim)
4921                 return to_reclaim;
4922
4923         used = space_info->bytes_used + space_info->bytes_reserved +
4924                space_info->bytes_pinned + space_info->bytes_readonly +
4925                space_info->bytes_may_use;
4926         if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
4927                 expected = div_factor_fine(space_info->total_bytes, 95);
4928         else
4929                 expected = div_factor_fine(space_info->total_bytes, 90);
4930
4931         if (used > expected)
4932                 to_reclaim = used - expected;
4933         else
4934                 to_reclaim = 0;
4935         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4936                                      space_info->bytes_reserved);
4937         return to_reclaim;
4938 }
4939
4940 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4941                                         struct btrfs_root *root, u64 used)
4942 {
4943         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4944
4945         /* If we're just plain full then async reclaim just slows us down. */
4946         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4947                 return 0;
4948
4949         if (!btrfs_calc_reclaim_metadata_size(root, space_info))
4950                 return 0;
4951
4952         return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
4953                 !test_bit(BTRFS_FS_STATE_REMOUNTING,
4954                           &root->fs_info->fs_state));
4955 }
4956
4957 static void wake_all_tickets(struct list_head *head)
4958 {
4959         struct reserve_ticket *ticket;
4960
4961         while (!list_empty(head)) {
4962                 ticket = list_first_entry(head, struct reserve_ticket, list);
4963                 list_del_init(&ticket->list);
4964                 ticket->error = -ENOSPC;
4965                 wake_up(&ticket->wait);
4966         }
4967 }
4968
4969 /*
4970  * This is for normal flushers, we can wait all goddamned day if we want to.  We
4971  * will loop and continuously try to flush as long as we are making progress.
4972  * We count progress as clearing off tickets each time we have to loop.
4973  */
4974 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4975 {
4976         struct reserve_ticket *last_ticket = NULL;
4977         struct btrfs_fs_info *fs_info;
4978         struct btrfs_space_info *space_info;
4979         u64 to_reclaim;
4980         int flush_state;
4981         int commit_cycles = 0;
4982
4983         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4984         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4985
4986         spin_lock(&space_info->lock);
4987         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4988                                                       space_info);
4989         if (!to_reclaim) {
4990                 space_info->flush = 0;
4991                 spin_unlock(&space_info->lock);
4992                 return;
4993         }
4994         last_ticket = list_first_entry(&space_info->tickets,
4995                                        struct reserve_ticket, list);
4996         spin_unlock(&space_info->lock);
4997
4998         flush_state = FLUSH_DELAYED_ITEMS_NR;
4999         do {
5000                 struct reserve_ticket *ticket;
5001                 int ret;
5002
5003                 ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
5004                             to_reclaim, flush_state);
5005                 spin_lock(&space_info->lock);
5006                 if (list_empty(&space_info->tickets)) {
5007                         space_info->flush = 0;
5008                         spin_unlock(&space_info->lock);
5009                         return;
5010                 }
5011                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5012                                                               space_info);
5013                 ticket = list_first_entry(&space_info->tickets,
5014                                           struct reserve_ticket, list);
5015                 if (last_ticket == ticket) {
5016                         flush_state++;
5017                 } else {
5018                         last_ticket = ticket;
5019                         flush_state = FLUSH_DELAYED_ITEMS_NR;
5020                         if (commit_cycles)
5021                                 commit_cycles--;
5022                 }
5023
5024                 if (flush_state > COMMIT_TRANS) {
5025                         commit_cycles++;
5026                         if (commit_cycles > 2) {
5027                                 wake_all_tickets(&space_info->tickets);
5028                                 space_info->flush = 0;
5029                         } else {
5030                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
5031                         }
5032                 }
5033                 spin_unlock(&space_info->lock);
5034         } while (flush_state <= COMMIT_TRANS);
5035 }
5036
5037 void btrfs_init_async_reclaim_work(struct work_struct *work)
5038 {
5039         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5040 }
5041
5042 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5043                                             struct btrfs_space_info *space_info,
5044                                             struct reserve_ticket *ticket)
5045 {
5046         u64 to_reclaim;
5047         int flush_state = FLUSH_DELAYED_ITEMS_NR;
5048
5049         spin_lock(&space_info->lock);
5050         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5051                                                       space_info);
5052         if (!to_reclaim) {
5053                 spin_unlock(&space_info->lock);
5054                 return;
5055         }
5056         spin_unlock(&space_info->lock);
5057
5058         do {
5059                 flush_space(fs_info->fs_root, space_info, to_reclaim,
5060                             to_reclaim, flush_state);
5061                 flush_state++;
5062                 spin_lock(&space_info->lock);
5063                 if (ticket->bytes == 0) {
5064                         spin_unlock(&space_info->lock);
5065                         return;
5066                 }
5067                 spin_unlock(&space_info->lock);
5068
5069                 /*
5070                  * Priority flushers can't wait on delalloc without
5071                  * deadlocking.
5072                  */
5073                 if (flush_state == FLUSH_DELALLOC ||
5074                     flush_state == FLUSH_DELALLOC_WAIT)
5075                         flush_state = ALLOC_CHUNK;
5076         } while (flush_state < COMMIT_TRANS);
5077 }
5078
5079 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5080                                struct btrfs_space_info *space_info,
5081                                struct reserve_ticket *ticket, u64 orig_bytes)
5082
5083 {
5084         DEFINE_WAIT(wait);
5085         int ret = 0;
5086
5087         spin_lock(&space_info->lock);
5088         while (ticket->bytes > 0 && ticket->error == 0) {
5089                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5090                 if (ret) {
5091                         ret = -EINTR;
5092                         break;
5093                 }
5094                 spin_unlock(&space_info->lock);
5095
5096                 schedule();
5097
5098                 finish_wait(&ticket->wait, &wait);
5099                 spin_lock(&space_info->lock);
5100         }
5101         if (!ret)
5102                 ret = ticket->error;
5103         if (!list_empty(&ticket->list))
5104                 list_del_init(&ticket->list);
5105         if (ticket->bytes && ticket->bytes < orig_bytes) {
5106                 u64 num_bytes = orig_bytes - ticket->bytes;
5107                 space_info->bytes_may_use -= num_bytes;
5108                 trace_btrfs_space_reservation(fs_info, "space_info",
5109                                               space_info->flags, num_bytes, 0);
5110         }
5111         spin_unlock(&space_info->lock);
5112
5113         return ret;
5114 }
5115
5116 /**
5117  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5118  * @root - the root we're allocating for
5119  * @space_info - the space info we want to allocate from
5120  * @orig_bytes - the number of bytes we want
5121  * @flush - whether or not we can flush to make our reservation
5122  *
5123  * This will reserve orig_bytes number of bytes from the space info associated
5124  * with the block_rsv.  If there is not enough space it will make an attempt to
5125  * flush out space to make room.  It will do this by flushing delalloc if
5126  * possible or committing the transaction.  If flush is 0 then no attempts to
5127  * regain reservations will be made and this will fail if there is not enough
5128  * space already.
5129  */
5130 static int __reserve_metadata_bytes(struct btrfs_root *root,
5131                                     struct btrfs_space_info *space_info,
5132                                     u64 orig_bytes,
5133                                     enum btrfs_reserve_flush_enum flush)
5134 {
5135         struct reserve_ticket ticket;
5136         u64 used;
5137         int ret = 0;
5138
5139         ASSERT(orig_bytes);
5140         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5141
5142         spin_lock(&space_info->lock);
5143         ret = -ENOSPC;
5144         used = space_info->bytes_used + space_info->bytes_reserved +
5145                 space_info->bytes_pinned + space_info->bytes_readonly +
5146                 space_info->bytes_may_use;
5147
5148         /*
5149          * If we have enough space then hooray, make our reservation and carry
5150          * on.  If not see if we can overcommit, and if we can, hooray carry on.
5151          * If not things get more complicated.
5152          */
5153         if (used + orig_bytes <= space_info->total_bytes) {
5154                 space_info->bytes_may_use += orig_bytes;
5155                 trace_btrfs_space_reservation(root->fs_info, "space_info",
5156                                               space_info->flags, orig_bytes,
5157                                               1);
5158                 ret = 0;
5159         } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
5160                 space_info->bytes_may_use += orig_bytes;
5161                 trace_btrfs_space_reservation(root->fs_info, "space_info",
5162                                               space_info->flags, orig_bytes,
5163                                               1);
5164                 ret = 0;
5165         }
5166
5167         /*
5168          * If we couldn't make a reservation then setup our reservation ticket
5169          * and kick the async worker if it's not already running.
5170          *
5171          * If we are a priority flusher then we just need to add our ticket to
5172          * the list and we will do our own flushing further down.
5173          */
5174         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5175                 ticket.bytes = orig_bytes;
5176                 ticket.error = 0;
5177                 init_waitqueue_head(&ticket.wait);
5178                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5179                         list_add_tail(&ticket.list, &space_info->tickets);
5180                         if (!space_info->flush) {
5181                                 space_info->flush = 1;
5182                                 trace_btrfs_trigger_flush(root->fs_info,
5183                                                           space_info->flags,
5184                                                           orig_bytes, flush,
5185                                                           "enospc");
5186                                 queue_work(system_unbound_wq,
5187                                            &root->fs_info->async_reclaim_work);
5188                         }
5189                 } else {
5190                         list_add_tail(&ticket.list,
5191                                       &space_info->priority_tickets);
5192                 }
5193         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5194                 used += orig_bytes;
5195                 /*
5196                  * We will do the space reservation dance during log replay,
5197                  * which means we won't have fs_info->fs_root set, so don't do
5198                  * the async reclaim as we will panic.
5199                  */
5200                 if (!root->fs_info->log_root_recovering &&
5201                     need_do_async_reclaim(space_info, root, used) &&
5202                     !work_busy(&root->fs_info->async_reclaim_work)) {
5203                         trace_btrfs_trigger_flush(root->fs_info,
5204                                                   space_info->flags,
5205                                                   orig_bytes, flush,
5206                                                   "preempt");
5207                         queue_work(system_unbound_wq,
5208                                    &root->fs_info->async_reclaim_work);
5209                 }
5210         }
5211         spin_unlock(&space_info->lock);
5212         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5213                 return ret;
5214
5215         if (flush == BTRFS_RESERVE_FLUSH_ALL)
5216                 return wait_reserve_ticket(root->fs_info, space_info, &ticket,
5217                                            orig_bytes);
5218
5219         ret = 0;
5220         priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
5221         spin_lock(&space_info->lock);
5222         if (ticket.bytes) {
5223                 if (ticket.bytes < orig_bytes) {
5224                         u64 num_bytes = orig_bytes - ticket.bytes;
5225                         space_info->bytes_may_use -= num_bytes;
5226                         trace_btrfs_space_reservation(root->fs_info,
5227                                         "space_info", space_info->flags,
5228                                         num_bytes, 0);
5229
5230                 }
5231                 list_del_init(&ticket.list);
5232                 ret = -ENOSPC;
5233         }
5234         spin_unlock(&space_info->lock);
5235         ASSERT(list_empty(&ticket.list));
5236         return ret;
5237 }
5238
5239 /**
5240  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5241  * @root - the root we're allocating for
5242  * @block_rsv - the block_rsv we're allocating for
5243  * @orig_bytes - the number of bytes we want
5244  * @flush - whether or not we can flush to make our reservation
5245  *
5246  * This will reserve orgi_bytes number of bytes from the space info associated
5247  * with the block_rsv.  If there is not enough space it will make an attempt to
5248  * flush out space to make room.  It will do this by flushing delalloc if
5249  * possible or committing the transaction.  If flush is 0 then no attempts to
5250  * regain reservations will be made and this will fail if there is not enough
5251  * space already.
5252  */
5253 static int reserve_metadata_bytes(struct btrfs_root *root,
5254                                   struct btrfs_block_rsv *block_rsv,
5255                                   u64 orig_bytes,
5256                                   enum btrfs_reserve_flush_enum flush)
5257 {
5258         int ret;
5259
5260         ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
5261                                        flush);
5262         if (ret == -ENOSPC &&
5263             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5264                 struct btrfs_block_rsv *global_rsv =
5265                         &root->fs_info->global_block_rsv;
5266
5267                 if (block_rsv != global_rsv &&
5268                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5269                         ret = 0;
5270         }
5271         if (ret == -ENOSPC)
5272                 trace_btrfs_space_reservation(root->fs_info,
5273                                               "space_info:enospc",
5274                                               block_rsv->space_info->flags,
5275                                               orig_bytes, 1);
5276         return ret;
5277 }
5278
5279 static struct btrfs_block_rsv *get_block_rsv(
5280                                         const struct btrfs_trans_handle *trans,
5281                                         const struct btrfs_root *root)
5282 {
5283         struct btrfs_block_rsv *block_rsv = NULL;
5284
5285         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5286             (root == root->fs_info->csum_root && trans->adding_csums) ||
5287              (root == root->fs_info->uuid_root))
5288                 block_rsv = trans->block_rsv;
5289
5290         if (!block_rsv)
5291                 block_rsv = root->block_rsv;
5292
5293         if (!block_rsv)
5294                 block_rsv = &root->fs_info->empty_block_rsv;
5295
5296         return block_rsv;
5297 }
5298
5299 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5300                                u64 num_bytes)
5301 {
5302         int ret = -ENOSPC;
5303         spin_lock(&block_rsv->lock);
5304         if (block_rsv->reserved >= num_bytes) {
5305                 block_rsv->reserved -= num_bytes;
5306                 if (block_rsv->reserved < block_rsv->size)
5307                         block_rsv->full = 0;
5308                 ret = 0;
5309         }
5310         spin_unlock(&block_rsv->lock);
5311         return ret;
5312 }
5313
5314 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5315                                 u64 num_bytes, int update_size)
5316 {
5317         spin_lock(&block_rsv->lock);
5318         block_rsv->reserved += num_bytes;
5319         if (update_size)
5320                 block_rsv->size += num_bytes;
5321         else if (block_rsv->reserved >= block_rsv->size)
5322                 block_rsv->full = 1;
5323         spin_unlock(&block_rsv->lock);
5324 }
5325
5326 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5327                              struct btrfs_block_rsv *dest, u64 num_bytes,
5328                              int min_factor)
5329 {
5330         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5331         u64 min_bytes;
5332
5333         if (global_rsv->space_info != dest->space_info)
5334                 return -ENOSPC;
5335
5336         spin_lock(&global_rsv->lock);
5337         min_bytes = div_factor(global_rsv->size, min_factor);
5338         if (global_rsv->reserved < min_bytes + num_bytes) {
5339                 spin_unlock(&global_rsv->lock);
5340                 return -ENOSPC;
5341         }
5342         global_rsv->reserved -= num_bytes;
5343         if (global_rsv->reserved < global_rsv->size)
5344                 global_rsv->full = 0;
5345         spin_unlock(&global_rsv->lock);
5346
5347         block_rsv_add_bytes(dest, num_bytes, 1);
5348         return 0;
5349 }
5350
5351 /*
5352  * This is for space we already have accounted in space_info->bytes_may_use, so
5353  * basically when we're returning space from block_rsv's.
5354  */
5355 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5356                                      struct btrfs_space_info *space_info,
5357                                      u64 num_bytes)
5358 {
5359         struct reserve_ticket *ticket;
5360         struct list_head *head;
5361         u64 used;
5362         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5363         bool check_overcommit = false;
5364
5365         spin_lock(&space_info->lock);
5366         head = &space_info->priority_tickets;
5367
5368         /*
5369          * If we are over our limit then we need to check and see if we can
5370          * overcommit, and if we can't then we just need to free up our space
5371          * and not satisfy any requests.
5372          */
5373         used = space_info->bytes_used + space_info->bytes_reserved +
5374                 space_info->bytes_pinned + space_info->bytes_readonly +
5375                 space_info->bytes_may_use;
5376         if (used - num_bytes >= space_info->total_bytes)
5377                 check_overcommit = true;
5378 again:
5379         while (!list_empty(head) && num_bytes) {
5380                 ticket = list_first_entry(head, struct reserve_ticket,
5381                                           list);
5382                 /*
5383                  * We use 0 bytes because this space is already reserved, so
5384                  * adding the ticket space would be a double count.
5385                  */
5386                 if (check_overcommit &&
5387                     !can_overcommit(fs_info->extent_root, space_info, 0,
5388                                     flush))
5389                         break;
5390                 if (num_bytes >= ticket->bytes) {
5391                         list_del_init(&ticket->list);
5392                         num_bytes -= ticket->bytes;
5393                         ticket->bytes = 0;
5394                         wake_up(&ticket->wait);
5395                 } else {
5396                         ticket->bytes -= num_bytes;
5397                         num_bytes = 0;
5398                 }
5399         }
5400
5401         if (num_bytes && head == &space_info->priority_tickets) {
5402                 head = &space_info->tickets;
5403                 flush = BTRFS_RESERVE_FLUSH_ALL;
5404                 goto again;
5405         }
5406         space_info->bytes_may_use -= num_bytes;
5407         trace_btrfs_space_reservation(fs_info, "space_info",
5408                                       space_info->flags, num_bytes, 0);
5409         spin_unlock(&space_info->lock);
5410 }
5411
5412 /*
5413  * This is for newly allocated space that isn't accounted in
5414  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5415  * we use this helper.
5416  */
5417 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5418                                      struct btrfs_space_info *space_info,
5419                                      u64 num_bytes)
5420 {
5421         struct reserve_ticket *ticket;
5422         struct list_head *head = &space_info->priority_tickets;
5423
5424 again:
5425         while (!list_empty(head) && num_bytes) {
5426                 ticket = list_first_entry(head, struct reserve_ticket,
5427                                           list);
5428                 if (num_bytes >= ticket->bytes) {
5429                         trace_btrfs_space_reservation(fs_info, "space_info",
5430                                                       space_info->flags,
5431                                                       ticket->bytes, 1);
5432                         list_del_init(&ticket->list);
5433                         num_bytes -= ticket->bytes;
5434                         space_info->bytes_may_use += ticket->bytes;
5435                         ticket->bytes = 0;
5436                         wake_up(&ticket->wait);
5437                 } else {
5438                         trace_btrfs_space_reservation(fs_info, "space_info",
5439                                                       space_info->flags,
5440                                                       num_bytes, 1);
5441                         space_info->bytes_may_use += num_bytes;
5442                         ticket->bytes -= num_bytes;
5443                         num_bytes = 0;
5444                 }
5445         }
5446
5447         if (num_bytes && head == &space_info->priority_tickets) {
5448                 head = &space_info->tickets;
5449                 goto again;
5450         }
5451 }
5452
5453 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5454                                     struct btrfs_block_rsv *block_rsv,
5455                                     struct btrfs_block_rsv *dest, u64 num_bytes)
5456 {
5457         struct btrfs_space_info *space_info = block_rsv->space_info;
5458
5459         spin_lock(&block_rsv->lock);
5460         if (num_bytes == (u64)-1)
5461                 num_bytes = block_rsv->size;
5462         block_rsv->size -= num_bytes;
5463         if (block_rsv->reserved >= block_rsv->size) {
5464                 num_bytes = block_rsv->reserved - block_rsv->size;
5465                 block_rsv->reserved = block_rsv->size;
5466                 block_rsv->full = 1;
5467         } else {
5468                 num_bytes = 0;
5469         }
5470         spin_unlock(&block_rsv->lock);
5471
5472         if (num_bytes > 0) {
5473                 if (dest) {
5474                         spin_lock(&dest->lock);
5475                         if (!dest->full) {
5476                                 u64 bytes_to_add;
5477
5478                                 bytes_to_add = dest->size - dest->reserved;
5479                                 bytes_to_add = min(num_bytes, bytes_to_add);
5480                                 dest->reserved += bytes_to_add;
5481                                 if (dest->reserved >= dest->size)
5482                                         dest->full = 1;
5483                                 num_bytes -= bytes_to_add;
5484                         }
5485                         spin_unlock(&dest->lock);
5486                 }
5487                 if (num_bytes)
5488                         space_info_add_old_bytes(fs_info, space_info,
5489                                                  num_bytes);
5490         }
5491 }
5492
5493 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5494                             struct btrfs_block_rsv *dst, u64 num_bytes,
5495                             int update_size)
5496 {
5497         int ret;
5498
5499         ret = block_rsv_use_bytes(src, num_bytes);
5500         if (ret)
5501                 return ret;
5502
5503         block_rsv_add_bytes(dst, num_bytes, update_size);
5504         return 0;
5505 }
5506
5507 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5508 {
5509         memset(rsv, 0, sizeof(*rsv));
5510         spin_lock_init(&rsv->lock);
5511         rsv->type = type;
5512 }
5513
5514 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5515                                               unsigned short type)
5516 {
5517         struct btrfs_block_rsv *block_rsv;
5518         struct btrfs_fs_info *fs_info = root->fs_info;
5519
5520         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5521         if (!block_rsv)
5522                 return NULL;
5523
5524         btrfs_init_block_rsv(block_rsv, type);
5525         block_rsv->space_info = __find_space_info(fs_info,
5526                                                   BTRFS_BLOCK_GROUP_METADATA);
5527         return block_rsv;
5528 }
5529
5530 void btrfs_free_block_rsv(struct btrfs_root *root,
5531                           struct btrfs_block_rsv *rsv)
5532 {
5533         if (!rsv)
5534                 return;
5535         btrfs_block_rsv_release(root, rsv, (u64)-1);
5536         kfree(rsv);
5537 }
5538
5539 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5540 {
5541         kfree(rsv);
5542 }
5543
5544 int btrfs_block_rsv_add(struct btrfs_root *root,
5545                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5546                         enum btrfs_reserve_flush_enum flush)
5547 {
5548         int ret;
5549
5550         if (num_bytes == 0)
5551                 return 0;
5552
5553         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5554         if (!ret) {
5555                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5556                 return 0;
5557         }
5558
5559         return ret;
5560 }
5561
5562 int btrfs_block_rsv_check(struct btrfs_root *root,
5563                           struct btrfs_block_rsv *block_rsv, int min_factor)
5564 {
5565         u64 num_bytes = 0;
5566         int ret = -ENOSPC;
5567
5568         if (!block_rsv)
5569                 return 0;
5570
5571         spin_lock(&block_rsv->lock);
5572         num_bytes = div_factor(block_rsv->size, min_factor);
5573         if (block_rsv->reserved >= num_bytes)
5574                 ret = 0;
5575         spin_unlock(&block_rsv->lock);
5576
5577         return ret;
5578 }
5579
5580 int btrfs_block_rsv_refill(struct btrfs_root *root,
5581                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5582                            enum btrfs_reserve_flush_enum flush)
5583 {
5584         u64 num_bytes = 0;
5585         int ret = -ENOSPC;
5586
5587         if (!block_rsv)
5588                 return 0;
5589
5590         spin_lock(&block_rsv->lock);
5591         num_bytes = min_reserved;
5592         if (block_rsv->reserved >= num_bytes)
5593                 ret = 0;
5594         else
5595                 num_bytes -= block_rsv->reserved;
5596         spin_unlock(&block_rsv->lock);
5597
5598         if (!ret)
5599                 return 0;
5600
5601         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5602         if (!ret) {
5603                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5604                 return 0;
5605         }
5606
5607         return ret;
5608 }
5609
5610 void btrfs_block_rsv_release(struct btrfs_root *root,
5611                              struct btrfs_block_rsv *block_rsv,
5612                              u64 num_bytes)
5613 {
5614         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5615         if (global_rsv == block_rsv ||
5616             block_rsv->space_info != global_rsv->space_info)
5617                 global_rsv = NULL;
5618         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5619                                 num_bytes);
5620 }
5621
5622 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5623 {
5624         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5625         struct btrfs_space_info *sinfo = block_rsv->space_info;
5626         u64 num_bytes;
5627
5628         /*
5629          * The global block rsv is based on the size of the extent tree, the
5630          * checksum tree and the root tree.  If the fs is empty we want to set
5631          * it to a minimal amount for safety.
5632          */
5633         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5634                 btrfs_root_used(&fs_info->csum_root->root_item) +
5635                 btrfs_root_used(&fs_info->tree_root->root_item);
5636         num_bytes = max_t(u64, num_bytes, SZ_16M);
5637
5638         spin_lock(&sinfo->lock);
5639         spin_lock(&block_rsv->lock);
5640
5641         block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5642
5643         if (block_rsv->reserved < block_rsv->size) {
5644                 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5645                         sinfo->bytes_reserved + sinfo->bytes_readonly +
5646                         sinfo->bytes_may_use;
5647                 if (sinfo->total_bytes > num_bytes) {
5648                         num_bytes = sinfo->total_bytes - num_bytes;
5649                         num_bytes = min(num_bytes,
5650                                         block_rsv->size - block_rsv->reserved);
5651                         block_rsv->reserved += num_bytes;
5652                         sinfo->bytes_may_use += num_bytes;
5653                         trace_btrfs_space_reservation(fs_info, "space_info",
5654                                                       sinfo->flags, num_bytes,
5655                                                       1);
5656                 }
5657         } else if (block_rsv->reserved > block_rsv->size) {
5658                 num_bytes = block_rsv->reserved - block_rsv->size;
5659                 sinfo->bytes_may_use -= num_bytes;
5660                 trace_btrfs_space_reservation(fs_info, "space_info",
5661                                       sinfo->flags, num_bytes, 0);
5662                 block_rsv->reserved = block_rsv->size;
5663         }
5664
5665         if (block_rsv->reserved == block_rsv->size)
5666                 block_rsv->full = 1;
5667         else
5668                 block_rsv->full = 0;
5669
5670         spin_unlock(&block_rsv->lock);
5671         spin_unlock(&sinfo->lock);
5672 }
5673
5674 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5675 {
5676         struct btrfs_space_info *space_info;
5677
5678         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5679         fs_info->chunk_block_rsv.space_info = space_info;
5680
5681         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5682         fs_info->global_block_rsv.space_info = space_info;
5683         fs_info->delalloc_block_rsv.space_info = space_info;
5684         fs_info->trans_block_rsv.space_info = space_info;
5685         fs_info->empty_block_rsv.space_info = space_info;
5686         fs_info->delayed_block_rsv.space_info = space_info;
5687
5688         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5689         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5690         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5691         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5692         if (fs_info->quota_root)
5693                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5694         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5695
5696         update_global_block_rsv(fs_info);
5697 }
5698
5699 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5700 {
5701         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5702                                 (u64)-1);
5703         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5704         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5705         WARN_ON(fs_info->trans_block_rsv.size > 0);
5706         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5707         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5708         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5709         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5710         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5711 }
5712
5713 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5714                                   struct btrfs_root *root)
5715 {
5716         if (!trans->block_rsv)
5717                 return;
5718
5719         if (!trans->bytes_reserved)
5720                 return;
5721
5722         trace_btrfs_space_reservation(root->fs_info, "transaction",
5723                                       trans->transid, trans->bytes_reserved, 0);
5724         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5725         trans->bytes_reserved = 0;
5726 }
5727
5728 /*
5729  * To be called after all the new block groups attached to the transaction
5730  * handle have been created (btrfs_create_pending_block_groups()).
5731  */
5732 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5733 {
5734         struct btrfs_fs_info *fs_info = trans->fs_info;
5735
5736         if (!trans->chunk_bytes_reserved)
5737                 return;
5738
5739         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5740
5741         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5742                                 trans->chunk_bytes_reserved);
5743         trans->chunk_bytes_reserved = 0;
5744 }
5745
5746 /* Can only return 0 or -ENOSPC */
5747 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5748                                   struct inode *inode)
5749 {
5750         struct btrfs_root *root = BTRFS_I(inode)->root;
5751         /*
5752          * We always use trans->block_rsv here as we will have reserved space
5753          * for our orphan when starting the transaction, using get_block_rsv()
5754          * here will sometimes make us choose the wrong block rsv as we could be
5755          * doing a reloc inode for a non refcounted root.
5756          */
5757         struct btrfs_block_rsv *src_rsv = trans->block_rsv;
5758         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5759
5760         /*
5761          * We need to hold space in order to delete our orphan item once we've
5762          * added it, so this takes the reservation so we can release it later
5763          * when we are truly done with the orphan item.
5764          */
5765         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5766         trace_btrfs_space_reservation(root->fs_info, "orphan",
5767                                       btrfs_ino(inode), num_bytes, 1);
5768         return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
5769 }
5770
5771 void btrfs_orphan_release_metadata(struct inode *inode)
5772 {
5773         struct btrfs_root *root = BTRFS_I(inode)->root;
5774         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5775         trace_btrfs_space_reservation(root->fs_info, "orphan",
5776                                       btrfs_ino(inode), num_bytes, 0);
5777         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5778 }
5779
5780 /*
5781  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5782  * root: the root of the parent directory
5783  * rsv: block reservation
5784  * items: the number of items that we need do reservation
5785  * qgroup_reserved: used to return the reserved size in qgroup
5786  *
5787  * This function is used to reserve the space for snapshot/subvolume
5788  * creation and deletion. Those operations are different with the
5789  * common file/directory operations, they change two fs/file trees
5790  * and root tree, the number of items that the qgroup reserves is
5791  * different with the free space reservation. So we can not use
5792  * the space reservation mechanism in start_transaction().
5793  */
5794 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5795                                      struct btrfs_block_rsv *rsv,
5796                                      int items,
5797                                      u64 *qgroup_reserved,
5798                                      bool use_global_rsv)
5799 {
5800         u64 num_bytes;
5801         int ret;
5802         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5803
5804         if (root->fs_info->quota_enabled) {
5805                 /* One for parent inode, two for dir entries */
5806                 num_bytes = 3 * root->nodesize;
5807                 ret = btrfs_qgroup_reserve_meta(root, num_bytes);
5808                 if (ret)
5809                         return ret;
5810         } else {
5811                 num_bytes = 0;
5812         }
5813
5814         *qgroup_reserved = num_bytes;
5815
5816         num_bytes = btrfs_calc_trans_metadata_size(root, items);
5817         rsv->space_info = __find_space_info(root->fs_info,
5818                                             BTRFS_BLOCK_GROUP_METADATA);
5819         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5820                                   BTRFS_RESERVE_FLUSH_ALL);
5821
5822         if (ret == -ENOSPC && use_global_rsv)
5823                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5824
5825         if (ret && *qgroup_reserved)
5826                 btrfs_qgroup_free_meta(root, *qgroup_reserved);
5827
5828         return ret;
5829 }
5830
5831 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5832                                       struct btrfs_block_rsv *rsv,
5833                                       u64 qgroup_reserved)
5834 {
5835         btrfs_block_rsv_release(root, rsv, (u64)-1);
5836 }
5837
5838 /**
5839  * drop_outstanding_extent - drop an outstanding extent
5840  * @inode: the inode we're dropping the extent for
5841  * @num_bytes: the number of bytes we're releasing.
5842  *
5843  * This is called when we are freeing up an outstanding extent, either called
5844  * after an error or after an extent is written.  This will return the number of
5845  * reserved extents that need to be freed.  This must be called with
5846  * BTRFS_I(inode)->lock held.
5847  */
5848 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5849 {
5850         unsigned drop_inode_space = 0;
5851         unsigned dropped_extents = 0;
5852         unsigned num_extents = 0;
5853
5854         num_extents = (unsigned)div64_u64(num_bytes +
5855                                           BTRFS_MAX_EXTENT_SIZE - 1,
5856                                           BTRFS_MAX_EXTENT_SIZE);
5857         ASSERT(num_extents);
5858         ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5859         BTRFS_I(inode)->outstanding_extents -= num_extents;
5860
5861         if (BTRFS_I(inode)->outstanding_extents == 0 &&
5862             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5863                                &BTRFS_I(inode)->runtime_flags))
5864                 drop_inode_space = 1;
5865
5866         /*
5867          * If we have more or the same amount of outstanding extents than we have
5868          * reserved then we need to leave the reserved extents count alone.
5869          */
5870         if (BTRFS_I(inode)->outstanding_extents >=
5871             BTRFS_I(inode)->reserved_extents)
5872                 return drop_inode_space;
5873
5874         dropped_extents = BTRFS_I(inode)->reserved_extents -
5875                 BTRFS_I(inode)->outstanding_extents;
5876         BTRFS_I(inode)->reserved_extents -= dropped_extents;
5877         return dropped_extents + drop_inode_space;
5878 }
5879
5880 /**
5881  * calc_csum_metadata_size - return the amount of metadata space that must be
5882  *      reserved/freed for the given bytes.
5883  * @inode: the inode we're manipulating
5884  * @num_bytes: the number of bytes in question
5885  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5886  *
5887  * This adjusts the number of csum_bytes in the inode and then returns the
5888  * correct amount of metadata that must either be reserved or freed.  We
5889  * calculate how many checksums we can fit into one leaf and then divide the
5890  * number of bytes that will need to be checksumed by this value to figure out
5891  * how many checksums will be required.  If we are adding bytes then the number
5892  * may go up and we will return the number of additional bytes that must be
5893  * reserved.  If it is going down we will return the number of bytes that must
5894  * be freed.
5895  *
5896  * This must be called with BTRFS_I(inode)->lock held.
5897  */
5898 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5899                                    int reserve)
5900 {
5901         struct btrfs_root *root = BTRFS_I(inode)->root;
5902         u64 old_csums, num_csums;
5903
5904         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5905             BTRFS_I(inode)->csum_bytes == 0)
5906                 return 0;
5907
5908         old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5909         if (reserve)
5910                 BTRFS_I(inode)->csum_bytes += num_bytes;
5911         else
5912                 BTRFS_I(inode)->csum_bytes -= num_bytes;
5913         num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5914
5915         /* No change, no need to reserve more */
5916         if (old_csums == num_csums)
5917                 return 0;
5918
5919         if (reserve)
5920                 return btrfs_calc_trans_metadata_size(root,
5921                                                       num_csums - old_csums);
5922
5923         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5924 }
5925
5926 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5927 {
5928         struct btrfs_root *root = BTRFS_I(inode)->root;
5929         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5930         u64 to_reserve = 0;
5931         u64 csum_bytes;
5932         unsigned nr_extents = 0;
5933         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5934         int ret = 0;
5935         bool delalloc_lock = true;
5936         u64 to_free = 0;
5937         unsigned dropped;
5938         bool release_extra = false;
5939
5940         /* If we are a free space inode we need to not flush since we will be in
5941          * the middle of a transaction commit.  We also don't need the delalloc
5942          * mutex since we won't race with anybody.  We need this mostly to make
5943          * lockdep shut its filthy mouth.
5944          *
5945          * If we have a transaction open (can happen if we call truncate_block
5946          * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5947          */
5948         if (btrfs_is_free_space_inode(inode)) {
5949                 flush = BTRFS_RESERVE_NO_FLUSH;
5950                 delalloc_lock = false;
5951         } else if (current->journal_info) {
5952                 flush = BTRFS_RESERVE_FLUSH_LIMIT;
5953         }
5954
5955         if (flush != BTRFS_RESERVE_NO_FLUSH &&
5956             btrfs_transaction_in_commit(root->fs_info))
5957                 schedule_timeout(1);
5958
5959         if (delalloc_lock)
5960                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5961
5962         num_bytes = ALIGN(num_bytes, root->sectorsize);
5963
5964         spin_lock(&BTRFS_I(inode)->lock);
5965         nr_extents = (unsigned)div64_u64(num_bytes +
5966                                          BTRFS_MAX_EXTENT_SIZE - 1,
5967                                          BTRFS_MAX_EXTENT_SIZE);
5968         BTRFS_I(inode)->outstanding_extents += nr_extents;
5969
5970         nr_extents = 0;
5971         if (BTRFS_I(inode)->outstanding_extents >
5972             BTRFS_I(inode)->reserved_extents)
5973                 nr_extents += BTRFS_I(inode)->outstanding_extents -
5974                         BTRFS_I(inode)->reserved_extents;
5975
5976         /* We always want to reserve a slot for updating the inode. */
5977         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
5978         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5979         csum_bytes = BTRFS_I(inode)->csum_bytes;
5980         spin_unlock(&BTRFS_I(inode)->lock);
5981
5982         if (root->fs_info->quota_enabled) {
5983                 ret = btrfs_qgroup_reserve_meta(root,
5984                                 nr_extents * root->nodesize);
5985                 if (ret)
5986                         goto out_fail;
5987         }
5988
5989         ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
5990         if (unlikely(ret)) {
5991                 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
5992                 goto out_fail;
5993         }
5994
5995         spin_lock(&BTRFS_I(inode)->lock);
5996         if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5997                              &BTRFS_I(inode)->runtime_flags)) {
5998                 to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
5999                 release_extra = true;
6000         }
6001         BTRFS_I(inode)->reserved_extents += nr_extents;
6002         spin_unlock(&BTRFS_I(inode)->lock);
6003
6004         if (delalloc_lock)
6005                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
6006
6007         if (to_reserve)
6008                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
6009                                               btrfs_ino(inode), to_reserve, 1);
6010         if (release_extra)
6011                 btrfs_block_rsv_release(root, block_rsv,
6012                                         btrfs_calc_trans_metadata_size(root,
6013                                                                        1));
6014         return 0;
6015
6016 out_fail:
6017         spin_lock(&BTRFS_I(inode)->lock);
6018         dropped = drop_outstanding_extent(inode, num_bytes);
6019         /*
6020          * If the inodes csum_bytes is the same as the original
6021          * csum_bytes then we know we haven't raced with any free()ers
6022          * so we can just reduce our inodes csum bytes and carry on.
6023          */
6024         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
6025                 calc_csum_metadata_size(inode, num_bytes, 0);
6026         } else {
6027                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
6028                 u64 bytes;
6029
6030                 /*
6031                  * This is tricky, but first we need to figure out how much we
6032                  * freed from any free-ers that occurred during this
6033                  * reservation, so we reset ->csum_bytes to the csum_bytes
6034                  * before we dropped our lock, and then call the free for the
6035                  * number of bytes that were freed while we were trying our
6036                  * reservation.
6037                  */
6038                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
6039                 BTRFS_I(inode)->csum_bytes = csum_bytes;
6040                 to_free = calc_csum_metadata_size(inode, bytes, 0);
6041
6042
6043                 /*
6044                  * Now we need to see how much we would have freed had we not
6045                  * been making this reservation and our ->csum_bytes were not
6046                  * artificially inflated.
6047                  */
6048                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
6049                 bytes = csum_bytes - orig_csum_bytes;
6050                 bytes = calc_csum_metadata_size(inode, bytes, 0);
6051
6052                 /*
6053                  * Now reset ->csum_bytes to what it should be.  If bytes is
6054                  * more than to_free then we would have freed more space had we
6055                  * not had an artificially high ->csum_bytes, so we need to free
6056                  * the remainder.  If bytes is the same or less then we don't
6057                  * need to do anything, the other free-ers did the correct
6058                  * thing.
6059                  */
6060                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
6061                 if (bytes > to_free)
6062                         to_free = bytes - to_free;
6063                 else
6064                         to_free = 0;
6065         }
6066         spin_unlock(&BTRFS_I(inode)->lock);
6067         if (dropped)
6068                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
6069
6070         if (to_free) {
6071                 btrfs_block_rsv_release(root, block_rsv, to_free);
6072                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
6073                                               btrfs_ino(inode), to_free, 0);
6074         }
6075         if (delalloc_lock)
6076                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
6077         return ret;
6078 }
6079
6080 /**
6081  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6082  * @inode: the inode to release the reservation for
6083  * @num_bytes: the number of bytes we're releasing
6084  *
6085  * This will release the metadata reservation for an inode.  This can be called
6086  * once we complete IO for a given set of bytes to release their metadata
6087  * reservations.
6088  */
6089 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
6090 {
6091         struct btrfs_root *root = BTRFS_I(inode)->root;
6092         u64 to_free = 0;
6093         unsigned dropped;
6094
6095         num_bytes = ALIGN(num_bytes, root->sectorsize);
6096         spin_lock(&BTRFS_I(inode)->lock);
6097         dropped = drop_outstanding_extent(inode, num_bytes);
6098
6099         if (num_bytes)
6100                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6101         spin_unlock(&BTRFS_I(inode)->lock);
6102         if (dropped > 0)
6103                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
6104
6105         if (btrfs_is_testing(root->fs_info))
6106                 return;
6107
6108         trace_btrfs_space_reservation(root->fs_info, "delalloc",
6109                                       btrfs_ino(inode), to_free, 0);
6110
6111         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
6112                                 to_free);
6113 }
6114
6115 /**
6116  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6117  * delalloc
6118  * @inode: inode we're writing to
6119  * @start: start range we are writing to
6120  * @len: how long the range we are writing to
6121  *
6122  * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
6123  *
6124  * This will do the following things
6125  *
6126  * o reserve space in data space info for num bytes
6127  *   and reserve precious corresponding qgroup space
6128  *   (Done in check_data_free_space)
6129  *
6130  * o reserve space for metadata space, based on the number of outstanding
6131  *   extents and how much csums will be needed
6132  *   also reserve metadata space in a per root over-reserve method.
6133  * o add to the inodes->delalloc_bytes
6134  * o add it to the fs_info's delalloc inodes list.
6135  *   (Above 3 all done in delalloc_reserve_metadata)
6136  *
6137  * Return 0 for success
6138  * Return <0 for error(-ENOSPC or -EQUOT)
6139  */
6140 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
6141 {
6142         int ret;
6143
6144         ret = btrfs_check_data_free_space(inode, start, len);
6145         if (ret < 0)
6146                 return ret;
6147         ret = btrfs_delalloc_reserve_metadata(inode, len);
6148         if (ret < 0)
6149                 btrfs_free_reserved_data_space(inode, start, len);
6150         return ret;
6151 }
6152
6153 /**
6154  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6155  * @inode: inode we're releasing space for
6156  * @start: start position of the space already reserved
6157  * @len: the len of the space already reserved
6158  *
6159  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
6160  * called in the case that we don't need the metadata AND data reservations
6161  * anymore.  So if there is an error or we insert an inline extent.
6162  *
6163  * This function will release the metadata space that was not used and will
6164  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6165  * list if there are no delalloc bytes left.
6166  * Also it will handle the qgroup reserved space.
6167  */
6168 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
6169 {
6170         btrfs_delalloc_release_metadata(inode, len);
6171         btrfs_free_reserved_data_space(inode, start, len);
6172 }
6173
6174 static int update_block_group(struct btrfs_trans_handle *trans,
6175                               struct btrfs_root *root, u64 bytenr,
6176                               u64 num_bytes, int alloc)
6177 {
6178         struct btrfs_block_group_cache *cache = NULL;
6179         struct btrfs_fs_info *info = root->fs_info;
6180         u64 total = num_bytes;
6181         u64 old_val;
6182         u64 byte_in_group;
6183         int factor;
6184
6185         /* block accounting for super block */
6186         spin_lock(&info->delalloc_root_lock);
6187         old_val = btrfs_super_bytes_used(info->super_copy);
6188         if (alloc)
6189                 old_val += num_bytes;
6190         else
6191                 old_val -= num_bytes;
6192         btrfs_set_super_bytes_used(info->super_copy, old_val);
6193         spin_unlock(&info->delalloc_root_lock);
6194
6195         while (total) {
6196                 cache = btrfs_lookup_block_group(info, bytenr);
6197                 if (!cache)
6198                         return -ENOENT;
6199                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6200                                     BTRFS_BLOCK_GROUP_RAID1 |
6201                                     BTRFS_BLOCK_GROUP_RAID10))
6202                         factor = 2;
6203                 else
6204                         factor = 1;
6205                 /*
6206                  * If this block group has free space cache written out, we
6207                  * need to make sure to load it if we are removing space.  This
6208                  * is because we need the unpinning stage to actually add the
6209                  * space back to the block group, otherwise we will leak space.
6210                  */
6211                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6212                         cache_block_group(cache, 1);
6213
6214                 byte_in_group = bytenr - cache->key.objectid;
6215                 WARN_ON(byte_in_group > cache->key.offset);
6216
6217                 spin_lock(&cache->space_info->lock);
6218                 spin_lock(&cache->lock);
6219
6220                 if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
6221                     cache->disk_cache_state < BTRFS_DC_CLEAR)
6222                         cache->disk_cache_state = BTRFS_DC_CLEAR;
6223
6224                 old_val = btrfs_block_group_used(&cache->item);
6225                 num_bytes = min(total, cache->key.offset - byte_in_group);
6226                 if (alloc) {
6227                         old_val += num_bytes;
6228                         btrfs_set_block_group_used(&cache->item, old_val);
6229                         cache->reserved -= num_bytes;
6230                         cache->space_info->bytes_reserved -= num_bytes;
6231                         cache->space_info->bytes_used += num_bytes;
6232                         cache->space_info->disk_used += num_bytes * factor;
6233                         spin_unlock(&cache->lock);
6234                         spin_unlock(&cache->space_info->lock);
6235                 } else {
6236                         old_val -= num_bytes;
6237                         btrfs_set_block_group_used(&cache->item, old_val);
6238                         cache->pinned += num_bytes;
6239                         cache->space_info->bytes_pinned += num_bytes;
6240                         cache->space_info->bytes_used -= num_bytes;
6241                         cache->space_info->disk_used -= num_bytes * factor;
6242                         spin_unlock(&cache->lock);
6243                         spin_unlock(&cache->space_info->lock);
6244
6245                         trace_btrfs_space_reservation(root->fs_info, "pinned",
6246                                                       cache->space_info->flags,
6247                                                       num_bytes, 1);
6248                         set_extent_dirty(info->pinned_extents,
6249                                          bytenr, bytenr + num_bytes - 1,
6250                                          GFP_NOFS | __GFP_NOFAIL);
6251                 }
6252
6253                 spin_lock(&trans->transaction->dirty_bgs_lock);
6254                 if (list_empty(&cache->dirty_list)) {
6255                         list_add_tail(&cache->dirty_list,
6256                                       &trans->transaction->dirty_bgs);
6257                                 trans->transaction->num_dirty_bgs++;
6258                         btrfs_get_block_group(cache);
6259                 }
6260                 spin_unlock(&trans->transaction->dirty_bgs_lock);
6261
6262                 /*
6263                  * No longer have used bytes in this block group, queue it for
6264                  * deletion. We do this after adding the block group to the
6265                  * dirty list to avoid races between cleaner kthread and space
6266                  * cache writeout.
6267                  */
6268                 if (!alloc && old_val == 0) {
6269                         spin_lock(&info->unused_bgs_lock);
6270                         if (list_empty(&cache->bg_list)) {
6271                                 btrfs_get_block_group(cache);
6272                                 list_add_tail(&cache->bg_list,
6273                                               &info->unused_bgs);
6274                         }
6275                         spin_unlock(&info->unused_bgs_lock);
6276                 }
6277
6278                 btrfs_put_block_group(cache);
6279                 total -= num_bytes;
6280                 bytenr += num_bytes;
6281         }
6282         return 0;
6283 }
6284
6285 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
6286 {
6287         struct btrfs_block_group_cache *cache;
6288         u64 bytenr;
6289
6290         spin_lock(&root->fs_info->block_group_cache_lock);
6291         bytenr = root->fs_info->first_logical_byte;
6292         spin_unlock(&root->fs_info->block_group_cache_lock);
6293
6294         if (bytenr < (u64)-1)
6295                 return bytenr;
6296
6297         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
6298         if (!cache)
6299                 return 0;
6300
6301         bytenr = cache->key.objectid;
6302         btrfs_put_block_group(cache);
6303
6304         return bytenr;
6305 }
6306
6307 static int pin_down_extent(struct btrfs_root *root,
6308                            struct btrfs_block_group_cache *cache,
6309                            u64 bytenr, u64 num_bytes, int reserved)
6310 {
6311         spin_lock(&cache->space_info->lock);
6312         spin_lock(&cache->lock);
6313         cache->pinned += num_bytes;
6314         cache->space_info->bytes_pinned += num_bytes;
6315         if (reserved) {
6316                 cache->reserved -= num_bytes;
6317                 cache->space_info->bytes_reserved -= num_bytes;
6318         }
6319         spin_unlock(&cache->lock);
6320         spin_unlock(&cache->space_info->lock);
6321
6322         trace_btrfs_space_reservation(root->fs_info, "pinned",
6323                                       cache->space_info->flags, num_bytes, 1);
6324         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
6325                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6326         return 0;
6327 }
6328
6329 /*
6330  * this function must be called within transaction
6331  */
6332 int btrfs_pin_extent(struct btrfs_root *root,
6333                      u64 bytenr, u64 num_bytes, int reserved)
6334 {
6335         struct btrfs_block_group_cache *cache;
6336
6337         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6338         BUG_ON(!cache); /* Logic error */
6339
6340         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
6341
6342         btrfs_put_block_group(cache);
6343         return 0;
6344 }
6345
6346 /*
6347  * this function must be called within transaction
6348  */
6349 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
6350                                     u64 bytenr, u64 num_bytes)
6351 {
6352         struct btrfs_block_group_cache *cache;
6353         int ret;
6354
6355         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6356         if (!cache)
6357                 return -EINVAL;
6358
6359         /*
6360          * pull in the free space cache (if any) so that our pin
6361          * removes the free space from the cache.  We have load_only set
6362          * to one because the slow code to read in the free extents does check
6363          * the pinned extents.
6364          */
6365         cache_block_group(cache, 1);
6366
6367         pin_down_extent(root, cache, bytenr, num_bytes, 0);
6368
6369         /* remove us from the free space cache (if we're there at all) */
6370         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6371         btrfs_put_block_group(cache);
6372         return ret;
6373 }
6374
6375 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
6376 {
6377         int ret;
6378         struct btrfs_block_group_cache *block_group;
6379         struct btrfs_caching_control *caching_ctl;
6380
6381         block_group = btrfs_lookup_block_group(root->fs_info, start);
6382         if (!block_group)
6383                 return -EINVAL;
6384
6385         cache_block_group(block_group, 0);
6386         caching_ctl = get_caching_control(block_group);
6387
6388         if (!caching_ctl) {
6389                 /* Logic error */
6390                 BUG_ON(!block_group_cache_done(block_group));
6391                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6392         } else {
6393                 mutex_lock(&caching_ctl->mutex);
6394
6395                 if (start >= caching_ctl->progress) {
6396                         ret = add_excluded_extent(root, start, num_bytes);
6397                 } else if (start + num_bytes <= caching_ctl->progress) {
6398                         ret = btrfs_remove_free_space(block_group,
6399                                                       start, num_bytes);
6400                 } else {
6401                         num_bytes = caching_ctl->progress - start;
6402                         ret = btrfs_remove_free_space(block_group,
6403                                                       start, num_bytes);
6404                         if (ret)
6405                                 goto out_lock;
6406
6407                         num_bytes = (start + num_bytes) -
6408                                 caching_ctl->progress;
6409                         start = caching_ctl->progress;
6410                         ret = add_excluded_extent(root, start, num_bytes);
6411                 }
6412 out_lock:
6413                 mutex_unlock(&caching_ctl->mutex);
6414                 put_caching_control(caching_ctl);
6415         }
6416         btrfs_put_block_group(block_group);
6417         return ret;
6418 }
6419
6420 int btrfs_exclude_logged_extents(struct btrfs_root *log,
6421                                  struct extent_buffer *eb)
6422 {
6423         struct btrfs_file_extent_item *item;
6424         struct btrfs_key key;
6425         int found_type;
6426         int i;
6427
6428         if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
6429                 return 0;
6430
6431         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6432                 btrfs_item_key_to_cpu(eb, &key, i);
6433                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6434                         continue;
6435                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6436                 found_type = btrfs_file_extent_type(eb, item);
6437                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6438                         continue;
6439                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6440                         continue;
6441                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6442                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6443                 __exclude_logged_extent(log, key.objectid, key.offset);
6444         }
6445
6446         return 0;
6447 }
6448
6449 static void
6450 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6451 {
6452         atomic_inc(&bg->reservations);
6453 }
6454
6455 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6456                                         const u64 start)
6457 {
6458         struct btrfs_block_group_cache *bg;
6459
6460         bg = btrfs_lookup_block_group(fs_info, start);
6461         ASSERT(bg);
6462         if (atomic_dec_and_test(&bg->reservations))
6463                 wake_up_atomic_t(&bg->reservations);
6464         btrfs_put_block_group(bg);
6465 }
6466
6467 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6468 {
6469         schedule();
6470         return 0;
6471 }
6472
6473 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6474 {
6475         struct btrfs_space_info *space_info = bg->space_info;
6476
6477         ASSERT(bg->ro);
6478
6479         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6480                 return;
6481
6482         /*
6483          * Our block group is read only but before we set it to read only,
6484          * some task might have had allocated an extent from it already, but it
6485          * has not yet created a respective ordered extent (and added it to a
6486          * root's list of ordered extents).
6487          * Therefore wait for any task currently allocating extents, since the
6488          * block group's reservations counter is incremented while a read lock
6489          * on the groups' semaphore is held and decremented after releasing
6490          * the read access on that semaphore and creating the ordered extent.
6491          */
6492         down_write(&space_info->groups_sem);
6493         up_write(&space_info->groups_sem);
6494
6495         wait_on_atomic_t(&bg->reservations,
6496                          btrfs_wait_bg_reservations_atomic_t,
6497                          TASK_UNINTERRUPTIBLE);
6498 }
6499
6500 /**
6501  * btrfs_add_reserved_bytes - update the block_group and space info counters
6502  * @cache:      The cache we are manipulating
6503  * @num_bytes:  The number of bytes in question
6504  * @reserve:    One of the reservation enums
6505  * @delalloc:   The blocks are allocated for the delalloc write
6506  *
6507  * This is called by the allocator when it reserves space. Metadata
6508  * reservations should be called with RESERVE_ALLOC so we do the proper
6509  * ENOSPC accounting.  For data we handle the reservation through clearing the
6510  * delalloc bits in the io_tree.  We have to do this since we could end up
6511  * allocating less disk space for the amount of data we have reserved in the
6512  * case of compression.
6513  *
6514  * If this is a reservation and the block group has become read only we cannot
6515  * make the reservation and return -EAGAIN, otherwise this function always
6516  * succeeds.
6517  */
6518 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6519                                     u64 num_bytes, int reserve, int delalloc)
6520 {
6521         struct btrfs_space_info *space_info = cache->space_info;
6522         int ret = 0;
6523
6524         spin_lock(&space_info->lock);
6525         spin_lock(&cache->lock);
6526         if (cache->ro) {
6527                 ret = -EAGAIN;
6528         } else {
6529                 cache->reserved += num_bytes;
6530                 space_info->bytes_reserved += num_bytes;
6531                 if (reserve == RESERVE_ALLOC) {
6532                         trace_btrfs_space_reservation(cache->fs_info,
6533                                         "space_info", space_info->flags,
6534                                         num_bytes, 0);
6535                         space_info->bytes_may_use -= num_bytes;
6536                 }
6537
6538                 if (delalloc)
6539                         cache->delalloc_bytes += num_bytes;
6540         }
6541         spin_unlock(&cache->lock);
6542         spin_unlock(&space_info->lock);
6543         return ret;
6544 }
6545
6546 /**
6547  * btrfs_free_reserved_bytes - update the block_group and space info counters
6548  * @cache:      The cache we are manipulating
6549  * @num_bytes:  The number of bytes in question
6550  * @delalloc:   The blocks are allocated for the delalloc write
6551  *
6552  * This is called by somebody who is freeing space that was never actually used
6553  * on disk.  For example if you reserve some space for a new leaf in transaction
6554  * A and before transaction A commits you free that leaf, you call this with
6555  * reserve set to 0 in order to clear the reservation.
6556  */
6557
6558 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6559                                      u64 num_bytes, int delalloc)
6560 {
6561         struct btrfs_space_info *space_info = cache->space_info;
6562         int ret = 0;
6563
6564         spin_lock(&space_info->lock);
6565         spin_lock(&cache->lock);
6566         if (cache->ro)
6567                 space_info->bytes_readonly += num_bytes;
6568         cache->reserved -= num_bytes;
6569         space_info->bytes_reserved -= num_bytes;
6570
6571         if (delalloc)
6572                 cache->delalloc_bytes -= num_bytes;
6573         spin_unlock(&cache->lock);
6574         spin_unlock(&space_info->lock);
6575         return ret;
6576 }
6577 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6578                                 struct btrfs_root *root)
6579 {
6580         struct btrfs_fs_info *fs_info = root->fs_info;
6581         struct btrfs_caching_control *next;
6582         struct btrfs_caching_control *caching_ctl;
6583         struct btrfs_block_group_cache *cache;
6584
6585         down_write(&fs_info->commit_root_sem);
6586
6587         list_for_each_entry_safe(caching_ctl, next,
6588                                  &fs_info->caching_block_groups, list) {
6589                 cache = caching_ctl->block_group;
6590                 if (block_group_cache_done(cache)) {
6591                         cache->last_byte_to_unpin = (u64)-1;
6592                         list_del_init(&caching_ctl->list);
6593                         put_caching_control(caching_ctl);
6594                 } else {
6595                         cache->last_byte_to_unpin = caching_ctl->progress;
6596                 }
6597         }
6598
6599         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6600                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6601         else
6602                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6603
6604         up_write(&fs_info->commit_root_sem);
6605
6606         update_global_block_rsv(fs_info);
6607 }
6608
6609 /*
6610  * Returns the free cluster for the given space info and sets empty_cluster to
6611  * what it should be based on the mount options.
6612  */
6613 static struct btrfs_free_cluster *
6614 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
6615                    u64 *empty_cluster)
6616 {
6617         struct btrfs_free_cluster *ret = NULL;
6618         bool ssd = btrfs_test_opt(root->fs_info, SSD);
6619
6620         *empty_cluster = 0;
6621         if (btrfs_mixed_space_info(space_info))
6622                 return ret;
6623
6624         if (ssd)
6625                 *empty_cluster = SZ_2M;
6626         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6627                 ret = &root->fs_info->meta_alloc_cluster;
6628                 if (!ssd)
6629                         *empty_cluster = SZ_64K;
6630         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
6631                 ret = &root->fs_info->data_alloc_cluster;
6632         }
6633
6634         return ret;
6635 }
6636
6637 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6638                               const bool return_free_space)
6639 {
6640         struct btrfs_fs_info *fs_info = root->fs_info;
6641         struct btrfs_block_group_cache *cache = NULL;
6642         struct btrfs_space_info *space_info;
6643         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6644         struct btrfs_free_cluster *cluster = NULL;
6645         u64 len;
6646         u64 total_unpinned = 0;
6647         u64 empty_cluster = 0;
6648         bool readonly;
6649
6650         while (start <= end) {
6651                 readonly = false;
6652                 if (!cache ||
6653                     start >= cache->key.objectid + cache->key.offset) {
6654                         if (cache)
6655                                 btrfs_put_block_group(cache);
6656                         total_unpinned = 0;
6657                         cache = btrfs_lookup_block_group(fs_info, start);
6658                         BUG_ON(!cache); /* Logic error */
6659
6660                         cluster = fetch_cluster_info(root,
6661                                                      cache->space_info,
6662                                                      &empty_cluster);
6663                         empty_cluster <<= 1;
6664                 }
6665
6666                 len = cache->key.objectid + cache->key.offset - start;
6667                 len = min(len, end + 1 - start);
6668
6669                 if (start < cache->last_byte_to_unpin) {
6670                         len = min(len, cache->last_byte_to_unpin - start);
6671                         if (return_free_space)
6672                                 btrfs_add_free_space(cache, start, len);
6673                 }
6674
6675                 start += len;
6676                 total_unpinned += len;
6677                 space_info = cache->space_info;
6678
6679                 /*
6680                  * If this space cluster has been marked as fragmented and we've
6681                  * unpinned enough in this block group to potentially allow a
6682                  * cluster to be created inside of it go ahead and clear the
6683                  * fragmented check.
6684                  */
6685                 if (cluster && cluster->fragmented &&
6686                     total_unpinned > empty_cluster) {
6687                         spin_lock(&cluster->lock);
6688                         cluster->fragmented = 0;
6689                         spin_unlock(&cluster->lock);
6690                 }
6691
6692                 spin_lock(&space_info->lock);
6693                 spin_lock(&cache->lock);
6694                 cache->pinned -= len;
6695                 space_info->bytes_pinned -= len;
6696
6697                 trace_btrfs_space_reservation(fs_info, "pinned",
6698                                               space_info->flags, len, 0);
6699                 space_info->max_extent_size = 0;
6700                 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6701                 if (cache->ro) {
6702                         space_info->bytes_readonly += len;
6703                         readonly = true;
6704                 }
6705                 spin_unlock(&cache->lock);
6706                 if (!readonly && return_free_space &&
6707                     global_rsv->space_info == space_info) {
6708                         u64 to_add = len;
6709                         WARN_ON(!return_free_space);
6710                         spin_lock(&global_rsv->lock);
6711                         if (!global_rsv->full) {
6712                                 to_add = min(len, global_rsv->size -
6713                                              global_rsv->reserved);
6714                                 global_rsv->reserved += to_add;
6715                                 space_info->bytes_may_use += to_add;
6716                                 if (global_rsv->reserved >= global_rsv->size)
6717                                         global_rsv->full = 1;
6718                                 trace_btrfs_space_reservation(fs_info,
6719                                                               "space_info",
6720                                                               space_info->flags,
6721                                                               to_add, 1);
6722                                 len -= to_add;
6723                         }
6724                         spin_unlock(&global_rsv->lock);
6725                         /* Add to any tickets we may have */
6726                         if (len)
6727                                 space_info_add_new_bytes(fs_info, space_info,
6728                                                          len);
6729                 }
6730                 spin_unlock(&space_info->lock);
6731         }
6732
6733         if (cache)
6734                 btrfs_put_block_group(cache);
6735         return 0;
6736 }
6737
6738 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6739                                struct btrfs_root *root)
6740 {
6741         struct btrfs_fs_info *fs_info = root->fs_info;
6742         struct btrfs_block_group_cache *block_group, *tmp;
6743         struct list_head *deleted_bgs;
6744         struct extent_io_tree *unpin;
6745         u64 start;
6746         u64 end;
6747         int ret;
6748
6749         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6750                 unpin = &fs_info->freed_extents[1];
6751         else
6752                 unpin = &fs_info->freed_extents[0];
6753
6754         while (!trans->aborted) {
6755                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6756                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6757                                             EXTENT_DIRTY, NULL);
6758                 if (ret) {
6759                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6760                         break;
6761                 }
6762
6763                 if (btrfs_test_opt(root->fs_info, DISCARD))
6764                         ret = btrfs_discard_extent(root, start,
6765                                                    end + 1 - start, NULL);
6766
6767                 clear_extent_dirty(unpin, start, end);
6768                 unpin_extent_range(root, start, end, true);
6769                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6770                 cond_resched();
6771         }
6772
6773         /*
6774          * Transaction is finished.  We don't need the lock anymore.  We
6775          * do need to clean up the block groups in case of a transaction
6776          * abort.
6777          */
6778         deleted_bgs = &trans->transaction->deleted_bgs;
6779         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6780                 u64 trimmed = 0;
6781
6782                 ret = -EROFS;
6783                 if (!trans->aborted)
6784                         ret = btrfs_discard_extent(root,
6785                                                    block_group->key.objectid,
6786                                                    block_group->key.offset,
6787                                                    &trimmed);
6788
6789                 list_del_init(&block_group->bg_list);
6790                 btrfs_put_block_group_trimming(block_group);
6791                 btrfs_put_block_group(block_group);
6792
6793                 if (ret) {
6794                         const char *errstr = btrfs_decode_error(ret);
6795                         btrfs_warn(fs_info,
6796                                    "Discard failed while removing blockgroup: errno=%d %s\n",
6797                                    ret, errstr);
6798                 }
6799         }
6800
6801         return 0;
6802 }
6803
6804 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6805                              u64 owner, u64 root_objectid)
6806 {
6807         struct btrfs_space_info *space_info;
6808         u64 flags;
6809
6810         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6811                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6812                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
6813                 else
6814                         flags = BTRFS_BLOCK_GROUP_METADATA;
6815         } else {
6816                 flags = BTRFS_BLOCK_GROUP_DATA;
6817         }
6818
6819         space_info = __find_space_info(fs_info, flags);
6820         BUG_ON(!space_info); /* Logic bug */
6821         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6822 }
6823
6824
6825 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6826                                 struct btrfs_root *root,
6827                                 struct btrfs_delayed_ref_node *node, u64 parent,
6828                                 u64 root_objectid, u64 owner_objectid,
6829                                 u64 owner_offset, int refs_to_drop,
6830                                 struct btrfs_delayed_extent_op *extent_op)
6831 {
6832         struct btrfs_key key;
6833         struct btrfs_path *path;
6834         struct btrfs_fs_info *info = root->fs_info;
6835         struct btrfs_root *extent_root = info->extent_root;
6836         struct extent_buffer *leaf;
6837         struct btrfs_extent_item *ei;
6838         struct btrfs_extent_inline_ref *iref;
6839         int ret;
6840         int is_data;
6841         int extent_slot = 0;
6842         int found_extent = 0;
6843         int num_to_del = 1;
6844         u32 item_size;
6845         u64 refs;
6846         u64 bytenr = node->bytenr;
6847         u64 num_bytes = node->num_bytes;
6848         int last_ref = 0;
6849         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6850                                                  SKINNY_METADATA);
6851
6852         path = btrfs_alloc_path();
6853         if (!path)
6854                 return -ENOMEM;
6855
6856         path->reada = READA_FORWARD;
6857         path->leave_spinning = 1;
6858
6859         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6860         BUG_ON(!is_data && refs_to_drop != 1);
6861
6862         if (is_data)
6863                 skinny_metadata = 0;
6864
6865         ret = lookup_extent_backref(trans, extent_root, path, &iref,
6866                                     bytenr, num_bytes, parent,
6867                                     root_objectid, owner_objectid,
6868                                     owner_offset);
6869         if (ret == 0) {
6870                 extent_slot = path->slots[0];
6871                 while (extent_slot >= 0) {
6872                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6873                                               extent_slot);
6874                         if (key.objectid != bytenr)
6875                                 break;
6876                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6877                             key.offset == num_bytes) {
6878                                 found_extent = 1;
6879                                 break;
6880                         }
6881                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6882                             key.offset == owner_objectid) {
6883                                 found_extent = 1;
6884                                 break;
6885                         }
6886                         if (path->slots[0] - extent_slot > 5)
6887                                 break;
6888                         extent_slot--;
6889                 }
6890 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6891                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6892                 if (found_extent && item_size < sizeof(*ei))
6893                         found_extent = 0;
6894 #endif
6895                 if (!found_extent) {
6896                         BUG_ON(iref);
6897                         ret = remove_extent_backref(trans, extent_root, path,
6898                                                     NULL, refs_to_drop,
6899                                                     is_data, &last_ref);
6900                         if (ret) {
6901                                 btrfs_abort_transaction(trans, ret);
6902                                 goto out;
6903                         }
6904                         btrfs_release_path(path);
6905                         path->leave_spinning = 1;
6906
6907                         key.objectid = bytenr;
6908                         key.type = BTRFS_EXTENT_ITEM_KEY;
6909                         key.offset = num_bytes;
6910
6911                         if (!is_data && skinny_metadata) {
6912                                 key.type = BTRFS_METADATA_ITEM_KEY;
6913                                 key.offset = owner_objectid;
6914                         }
6915
6916                         ret = btrfs_search_slot(trans, extent_root,
6917                                                 &key, path, -1, 1);
6918                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6919                                 /*
6920                                  * Couldn't find our skinny metadata item,
6921                                  * see if we have ye olde extent item.
6922                                  */
6923                                 path->slots[0]--;
6924                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6925                                                       path->slots[0]);
6926                                 if (key.objectid == bytenr &&
6927                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6928                                     key.offset == num_bytes)
6929                                         ret = 0;
6930                         }
6931
6932                         if (ret > 0 && skinny_metadata) {
6933                                 skinny_metadata = false;
6934                                 key.objectid = bytenr;
6935                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6936                                 key.offset = num_bytes;
6937                                 btrfs_release_path(path);
6938                                 ret = btrfs_search_slot(trans, extent_root,
6939                                                         &key, path, -1, 1);
6940                         }
6941
6942                         if (ret) {
6943                                 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6944                                         ret, bytenr);
6945                                 if (ret > 0)
6946                                         btrfs_print_leaf(extent_root,
6947                                                          path->nodes[0]);
6948                         }
6949                         if (ret < 0) {
6950                                 btrfs_abort_transaction(trans, ret);
6951                                 goto out;
6952                         }
6953                         extent_slot = path->slots[0];
6954                 }
6955         } else if (WARN_ON(ret == -ENOENT)) {
6956                 btrfs_print_leaf(extent_root, path->nodes[0]);
6957                 btrfs_err(info,
6958                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6959                         bytenr, parent, root_objectid, owner_objectid,
6960                         owner_offset);
6961                 btrfs_abort_transaction(trans, ret);
6962                 goto out;
6963         } else {
6964                 btrfs_abort_transaction(trans, ret);
6965                 goto out;
6966         }
6967
6968         leaf = path->nodes[0];
6969         item_size = btrfs_item_size_nr(leaf, extent_slot);
6970 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6971         if (item_size < sizeof(*ei)) {
6972                 BUG_ON(found_extent || extent_slot != path->slots[0]);
6973                 ret = convert_extent_item_v0(trans, extent_root, path,
6974                                              owner_objectid, 0);
6975                 if (ret < 0) {
6976                         btrfs_abort_transaction(trans, ret);
6977                         goto out;
6978                 }
6979
6980                 btrfs_release_path(path);
6981                 path->leave_spinning = 1;
6982
6983                 key.objectid = bytenr;
6984                 key.type = BTRFS_EXTENT_ITEM_KEY;
6985                 key.offset = num_bytes;
6986
6987                 ret = btrfs_search_slot(trans, extent_root, &key, path,
6988                                         -1, 1);
6989                 if (ret) {
6990                         btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6991                                 ret, bytenr);
6992                         btrfs_print_leaf(extent_root, path->nodes[0]);
6993                 }
6994                 if (ret < 0) {
6995                         btrfs_abort_transaction(trans, ret);
6996                         goto out;
6997                 }
6998
6999                 extent_slot = path->slots[0];
7000                 leaf = path->nodes[0];
7001                 item_size = btrfs_item_size_nr(leaf, extent_slot);
7002         }
7003 #endif
7004         BUG_ON(item_size < sizeof(*ei));
7005         ei = btrfs_item_ptr(leaf, extent_slot,
7006                             struct btrfs_extent_item);
7007         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7008             key.type == BTRFS_EXTENT_ITEM_KEY) {
7009                 struct btrfs_tree_block_info *bi;
7010                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7011                 bi = (struct btrfs_tree_block_info *)(ei + 1);
7012                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7013         }
7014
7015         refs = btrfs_extent_refs(leaf, ei);
7016         if (refs < refs_to_drop) {
7017                 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
7018                           "for bytenr %Lu", refs_to_drop, refs, bytenr);
7019                 ret = -EINVAL;
7020                 btrfs_abort_transaction(trans, ret);
7021                 goto out;
7022         }
7023         refs -= refs_to_drop;
7024
7025         if (refs > 0) {
7026                 if (extent_op)
7027                         __run_delayed_extent_op(extent_op, leaf, ei);
7028                 /*
7029                  * In the case of inline back ref, reference count will
7030                  * be updated by remove_extent_backref
7031                  */
7032                 if (iref) {
7033                         BUG_ON(!found_extent);
7034                 } else {
7035                         btrfs_set_extent_refs(leaf, ei, refs);
7036                         btrfs_mark_buffer_dirty(leaf);
7037                 }
7038                 if (found_extent) {
7039                         ret = remove_extent_backref(trans, extent_root, path,
7040                                                     iref, refs_to_drop,
7041                                                     is_data, &last_ref);
7042                         if (ret) {
7043                                 btrfs_abort_transaction(trans, ret);
7044                                 goto out;
7045                         }
7046                 }
7047                 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
7048                                  root_objectid);
7049         } else {
7050                 if (found_extent) {
7051                         BUG_ON(is_data && refs_to_drop !=
7052                                extent_data_ref_count(path, iref));
7053                         if (iref) {
7054                                 BUG_ON(path->slots[0] != extent_slot);
7055                         } else {
7056                                 BUG_ON(path->slots[0] != extent_slot + 1);
7057                                 path->slots[0] = extent_slot;
7058                                 num_to_del = 2;
7059                         }
7060                 }
7061
7062                 last_ref = 1;
7063                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7064                                       num_to_del);
7065                 if (ret) {
7066                         btrfs_abort_transaction(trans, ret);
7067                         goto out;
7068                 }
7069                 btrfs_release_path(path);
7070
7071                 if (is_data) {
7072                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
7073                         if (ret) {
7074                                 btrfs_abort_transaction(trans, ret);
7075                                 goto out;
7076                         }
7077                 }
7078
7079                 ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
7080                                              num_bytes);
7081                 if (ret) {
7082                         btrfs_abort_transaction(trans, ret);
7083                         goto out;
7084                 }
7085
7086                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
7087                 if (ret) {
7088                         btrfs_abort_transaction(trans, ret);
7089                         goto out;
7090                 }
7091         }
7092         btrfs_release_path(path);
7093
7094 out:
7095         btrfs_free_path(path);
7096         return ret;
7097 }
7098
7099 /*
7100  * when we free an block, it is possible (and likely) that we free the last
7101  * delayed ref for that extent as well.  This searches the delayed ref tree for
7102  * a given extent, and if there are no other delayed refs to be processed, it
7103  * removes it from the tree.
7104  */
7105 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7106                                       struct btrfs_root *root, u64 bytenr)
7107 {
7108         struct btrfs_delayed_ref_head *head;
7109         struct btrfs_delayed_ref_root *delayed_refs;
7110         int ret = 0;
7111
7112         delayed_refs = &trans->transaction->delayed_refs;
7113         spin_lock(&delayed_refs->lock);
7114         head = btrfs_find_delayed_ref_head(trans, bytenr);
7115         if (!head)
7116                 goto out_delayed_unlock;
7117
7118         spin_lock(&head->lock);
7119         if (!list_empty(&head->ref_list))
7120                 goto out;
7121
7122         if (head->extent_op) {
7123                 if (!head->must_insert_reserved)
7124                         goto out;
7125                 btrfs_free_delayed_extent_op(head->extent_op);
7126                 head->extent_op = NULL;
7127         }
7128
7129         /*
7130          * waiting for the lock here would deadlock.  If someone else has it
7131          * locked they are already in the process of dropping it anyway
7132          */
7133         if (!mutex_trylock(&head->mutex))
7134                 goto out;
7135
7136         /*
7137          * at this point we have a head with no other entries.  Go
7138          * ahead and process it.
7139          */
7140         head->node.in_tree = 0;
7141         rb_erase(&head->href_node, &delayed_refs->href_root);
7142
7143         atomic_dec(&delayed_refs->num_entries);
7144
7145         /*
7146          * we don't take a ref on the node because we're removing it from the
7147          * tree, so we just steal the ref the tree was holding.
7148          */
7149         delayed_refs->num_heads--;
7150         if (head->processing == 0)
7151                 delayed_refs->num_heads_ready--;
7152         head->processing = 0;
7153         spin_unlock(&head->lock);
7154         spin_unlock(&delayed_refs->lock);
7155
7156         BUG_ON(head->extent_op);
7157         if (head->must_insert_reserved)
7158                 ret = 1;
7159
7160         mutex_unlock(&head->mutex);
7161         btrfs_put_delayed_ref(&head->node);
7162         return ret;
7163 out:
7164         spin_unlock(&head->lock);
7165
7166 out_delayed_unlock:
7167         spin_unlock(&delayed_refs->lock);
7168         return 0;
7169 }
7170
7171 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7172                            struct btrfs_root *root,
7173                            struct extent_buffer *buf,
7174                            u64 parent, int last_ref)
7175 {
7176         int pin = 1;
7177         int ret;
7178
7179         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7180                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7181                                         buf->start, buf->len,
7182                                         parent, root->root_key.objectid,
7183                                         btrfs_header_level(buf),
7184                                         BTRFS_DROP_DELAYED_REF, NULL);
7185                 BUG_ON(ret); /* -ENOMEM */
7186         }
7187
7188         if (!last_ref)
7189                 return;
7190
7191         if (btrfs_header_generation(buf) == trans->transid) {
7192                 struct btrfs_block_group_cache *cache;
7193
7194                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7195                         ret = check_ref_cleanup(trans, root, buf->start);
7196                         if (!ret)
7197                                 goto out;
7198                 }
7199
7200                 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
7201
7202                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7203                         pin_down_extent(root, cache, buf->start, buf->len, 1);
7204                         btrfs_put_block_group(cache);
7205                         goto out;
7206                 }
7207
7208                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7209
7210                 btrfs_add_free_space(cache, buf->start, buf->len);
7211                 btrfs_free_reserved_bytes(cache, buf->len, 0);
7212                 btrfs_put_block_group(cache);
7213                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
7214                 pin = 0;
7215         }
7216 out:
7217         if (pin)
7218                 add_pinned_bytes(root->fs_info, buf->len,
7219                                  btrfs_header_level(buf),
7220                                  root->root_key.objectid);
7221
7222         /*
7223          * Deleting the buffer, clear the corrupt flag since it doesn't matter
7224          * anymore.
7225          */
7226         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7227 }
7228
7229 /* Can return -ENOMEM */
7230 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7231                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7232                       u64 owner, u64 offset)
7233 {
7234         int ret;
7235         struct btrfs_fs_info *fs_info = root->fs_info;
7236
7237         if (btrfs_is_testing(fs_info))
7238                 return 0;
7239
7240         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
7241
7242         /*
7243          * tree log blocks never actually go into the extent allocation
7244          * tree, just update pinning info and exit early.
7245          */
7246         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7247                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7248                 /* unlocks the pinned mutex */
7249                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
7250                 ret = 0;
7251         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7252                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7253                                         num_bytes,
7254                                         parent, root_objectid, (int)owner,
7255                                         BTRFS_DROP_DELAYED_REF, NULL);
7256         } else {
7257                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7258                                                 num_bytes,
7259                                                 parent, root_objectid, owner,
7260                                                 offset, 0,
7261                                                 BTRFS_DROP_DELAYED_REF, NULL);
7262         }
7263         return ret;
7264 }
7265
7266 /*
7267  * when we wait for progress in the block group caching, its because
7268  * our allocation attempt failed at least once.  So, we must sleep
7269  * and let some progress happen before we try again.
7270  *
7271  * This function will sleep at least once waiting for new free space to
7272  * show up, and then it will check the block group free space numbers
7273  * for our min num_bytes.  Another option is to have it go ahead
7274  * and look in the rbtree for a free extent of a given size, but this
7275  * is a good start.
7276  *
7277  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7278  * any of the information in this block group.
7279  */
7280 static noinline void
7281 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7282                                 u64 num_bytes)
7283 {
7284         struct btrfs_caching_control *caching_ctl;
7285
7286         caching_ctl = get_caching_control(cache);
7287         if (!caching_ctl)
7288                 return;
7289
7290         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7291                    (cache->free_space_ctl->free_space >= num_bytes));
7292
7293         put_caching_control(caching_ctl);
7294 }
7295
7296 static noinline int
7297 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7298 {
7299         struct btrfs_caching_control *caching_ctl;
7300         int ret = 0;
7301
7302         caching_ctl = get_caching_control(cache);
7303         if (!caching_ctl)
7304                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7305
7306         wait_event(caching_ctl->wait, block_group_cache_done(cache));
7307         if (cache->cached == BTRFS_CACHE_ERROR)
7308                 ret = -EIO;
7309         put_caching_control(caching_ctl);
7310         return ret;
7311 }
7312
7313 int __get_raid_index(u64 flags)
7314 {
7315         if (flags & BTRFS_BLOCK_GROUP_RAID10)
7316                 return BTRFS_RAID_RAID10;
7317         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
7318                 return BTRFS_RAID_RAID1;
7319         else if (flags & BTRFS_BLOCK_GROUP_DUP)
7320                 return BTRFS_RAID_DUP;
7321         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
7322                 return BTRFS_RAID_RAID0;
7323         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
7324                 return BTRFS_RAID_RAID5;
7325         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
7326                 return BTRFS_RAID_RAID6;
7327
7328         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
7329 }
7330
7331 int get_block_group_index(struct btrfs_block_group_cache *cache)
7332 {
7333         return __get_raid_index(cache->flags);
7334 }
7335
7336 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
7337         [BTRFS_RAID_RAID10]     = "raid10",
7338         [BTRFS_RAID_RAID1]      = "raid1",
7339         [BTRFS_RAID_DUP]        = "dup",
7340         [BTRFS_RAID_RAID0]      = "raid0",
7341         [BTRFS_RAID_SINGLE]     = "single",
7342         [BTRFS_RAID_RAID5]      = "raid5",
7343         [BTRFS_RAID_RAID6]      = "raid6",
7344 };
7345
7346 static const char *get_raid_name(enum btrfs_raid_types type)
7347 {
7348         if (type >= BTRFS_NR_RAID_TYPES)
7349                 return NULL;
7350
7351         return btrfs_raid_type_names[type];
7352 }
7353
7354 enum btrfs_loop_type {
7355         LOOP_CACHING_NOWAIT = 0,
7356         LOOP_CACHING_WAIT = 1,
7357         LOOP_ALLOC_CHUNK = 2,
7358         LOOP_NO_EMPTY_SIZE = 3,
7359 };
7360
7361 static inline void
7362 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7363                        int delalloc)
7364 {
7365         if (delalloc)
7366                 down_read(&cache->data_rwsem);
7367 }
7368
7369 static inline void
7370 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7371                        int delalloc)
7372 {
7373         btrfs_get_block_group(cache);
7374         if (delalloc)
7375                 down_read(&cache->data_rwsem);
7376 }
7377
7378 static struct btrfs_block_group_cache *
7379 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7380                    struct btrfs_free_cluster *cluster,
7381                    int delalloc)
7382 {
7383         struct btrfs_block_group_cache *used_bg = NULL;
7384
7385         spin_lock(&cluster->refill_lock);
7386         while (1) {
7387                 used_bg = cluster->block_group;
7388                 if (!used_bg)
7389                         return NULL;
7390
7391                 if (used_bg == block_group)
7392                         return used_bg;
7393
7394                 btrfs_get_block_group(used_bg);
7395
7396                 if (!delalloc)
7397                         return used_bg;
7398
7399                 if (down_read_trylock(&used_bg->data_rwsem))
7400                         return used_bg;
7401
7402                 spin_unlock(&cluster->refill_lock);
7403
7404                 down_read(&used_bg->data_rwsem);
7405
7406                 spin_lock(&cluster->refill_lock);
7407                 if (used_bg == cluster->block_group)
7408                         return used_bg;
7409
7410                 up_read(&used_bg->data_rwsem);
7411                 btrfs_put_block_group(used_bg);
7412         }
7413 }
7414
7415 static inline void
7416 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7417                          int delalloc)
7418 {
7419         if (delalloc)
7420                 up_read(&cache->data_rwsem);
7421         btrfs_put_block_group(cache);
7422 }
7423
7424 /*
7425  * walks the btree of allocated extents and find a hole of a given size.
7426  * The key ins is changed to record the hole:
7427  * ins->objectid == start position
7428  * ins->flags = BTRFS_EXTENT_ITEM_KEY
7429  * ins->offset == the size of the hole.
7430  * Any available blocks before search_start are skipped.
7431  *
7432  * If there is no suitable free space, we will record the max size of
7433  * the free space extent currently.
7434  */
7435 static noinline int find_free_extent(struct btrfs_root *orig_root,
7436                                      u64 num_bytes, u64 empty_size,
7437                                      u64 hint_byte, struct btrfs_key *ins,
7438                                      u64 flags, int delalloc)
7439 {
7440         int ret = 0;
7441         struct btrfs_root *root = orig_root->fs_info->extent_root;
7442         struct btrfs_free_cluster *last_ptr = NULL;
7443         struct btrfs_block_group_cache *block_group = NULL;
7444         u64 search_start = 0;
7445         u64 max_extent_size = 0;
7446         u64 empty_cluster = 0;
7447         struct btrfs_space_info *space_info;
7448         int loop = 0;
7449         int index = __get_raid_index(flags);
7450         int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
7451                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
7452         bool failed_cluster_refill = false;
7453         bool failed_alloc = false;
7454         bool use_cluster = true;
7455         bool have_caching_bg = false;
7456         bool orig_have_caching_bg = false;
7457         bool full_search = false;
7458
7459         WARN_ON(num_bytes < root->sectorsize);
7460         ins->type = BTRFS_EXTENT_ITEM_KEY;
7461         ins->objectid = 0;
7462         ins->offset = 0;
7463
7464         trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
7465
7466         space_info = __find_space_info(root->fs_info, flags);
7467         if (!space_info) {
7468                 btrfs_err(root->fs_info, "No space info for %llu", flags);
7469                 return -ENOSPC;
7470         }
7471
7472         /*
7473          * If our free space is heavily fragmented we may not be able to make
7474          * big contiguous allocations, so instead of doing the expensive search
7475          * for free space, simply return ENOSPC with our max_extent_size so we
7476          * can go ahead and search for a more manageable chunk.
7477          *
7478          * If our max_extent_size is large enough for our allocation simply
7479          * disable clustering since we will likely not be able to find enough
7480          * space to create a cluster and induce latency trying.
7481          */
7482         if (unlikely(space_info->max_extent_size)) {
7483                 spin_lock(&space_info->lock);
7484                 if (space_info->max_extent_size &&
7485                     num_bytes > space_info->max_extent_size) {
7486                         ins->offset = space_info->max_extent_size;
7487                         spin_unlock(&space_info->lock);
7488                         return -ENOSPC;
7489                 } else if (space_info->max_extent_size) {
7490                         use_cluster = false;
7491                 }
7492                 spin_unlock(&space_info->lock);
7493         }
7494
7495         last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
7496         if (last_ptr) {
7497                 spin_lock(&last_ptr->lock);
7498                 if (last_ptr->block_group)
7499                         hint_byte = last_ptr->window_start;
7500                 if (last_ptr->fragmented) {
7501                         /*
7502                          * We still set window_start so we can keep track of the
7503                          * last place we found an allocation to try and save
7504                          * some time.
7505                          */
7506                         hint_byte = last_ptr->window_start;
7507                         use_cluster = false;
7508                 }
7509                 spin_unlock(&last_ptr->lock);
7510         }
7511
7512         search_start = max(search_start, first_logical_byte(root, 0));
7513         search_start = max(search_start, hint_byte);
7514         if (search_start == hint_byte) {
7515                 block_group = btrfs_lookup_block_group(root->fs_info,
7516                                                        search_start);
7517                 /*
7518                  * we don't want to use the block group if it doesn't match our
7519                  * allocation bits, or if its not cached.
7520                  *
7521                  * However if we are re-searching with an ideal block group
7522                  * picked out then we don't care that the block group is cached.
7523                  */
7524                 if (block_group && block_group_bits(block_group, flags) &&
7525                     block_group->cached != BTRFS_CACHE_NO) {
7526                         down_read(&space_info->groups_sem);
7527                         if (list_empty(&block_group->list) ||
7528                             block_group->ro) {
7529                                 /*
7530                                  * someone is removing this block group,
7531                                  * we can't jump into the have_block_group
7532                                  * target because our list pointers are not
7533                                  * valid
7534                                  */
7535                                 btrfs_put_block_group(block_group);
7536                                 up_read(&space_info->groups_sem);
7537                         } else {
7538                                 index = get_block_group_index(block_group);
7539                                 btrfs_lock_block_group(block_group, delalloc);
7540                                 goto have_block_group;
7541                         }
7542                 } else if (block_group) {
7543                         btrfs_put_block_group(block_group);
7544                 }
7545         }
7546 search:
7547         have_caching_bg = false;
7548         if (index == 0 || index == __get_raid_index(flags))
7549                 full_search = true;
7550         down_read(&space_info->groups_sem);
7551         list_for_each_entry(block_group, &space_info->block_groups[index],
7552                             list) {
7553                 u64 offset;
7554                 int cached;
7555
7556                 btrfs_grab_block_group(block_group, delalloc);
7557                 search_start = block_group->key.objectid;
7558
7559                 /*
7560                  * this can happen if we end up cycling through all the
7561                  * raid types, but we want to make sure we only allocate
7562                  * for the proper type.
7563                  */
7564                 if (!block_group_bits(block_group, flags)) {
7565                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
7566                                 BTRFS_BLOCK_GROUP_RAID1 |
7567                                 BTRFS_BLOCK_GROUP_RAID5 |
7568                                 BTRFS_BLOCK_GROUP_RAID6 |
7569                                 BTRFS_BLOCK_GROUP_RAID10;
7570
7571                         /*
7572                          * if they asked for extra copies and this block group
7573                          * doesn't provide them, bail.  This does allow us to
7574                          * fill raid0 from raid1.
7575                          */
7576                         if ((flags & extra) && !(block_group->flags & extra))
7577                                 goto loop;
7578                 }
7579
7580 have_block_group:
7581                 cached = block_group_cache_done(block_group);
7582                 if (unlikely(!cached)) {
7583                         have_caching_bg = true;
7584                         ret = cache_block_group(block_group, 0);
7585                         BUG_ON(ret < 0);
7586                         ret = 0;
7587                 }
7588
7589                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7590                         goto loop;
7591                 if (unlikely(block_group->ro))
7592                         goto loop;
7593
7594                 /*
7595                  * Ok we want to try and use the cluster allocator, so
7596                  * lets look there
7597                  */
7598                 if (last_ptr && use_cluster) {
7599                         struct btrfs_block_group_cache *used_block_group;
7600                         unsigned long aligned_cluster;
7601                         /*
7602                          * the refill lock keeps out other
7603                          * people trying to start a new cluster
7604                          */
7605                         used_block_group = btrfs_lock_cluster(block_group,
7606                                                               last_ptr,
7607                                                               delalloc);
7608                         if (!used_block_group)
7609                                 goto refill_cluster;
7610
7611                         if (used_block_group != block_group &&
7612                             (used_block_group->ro ||
7613                              !block_group_bits(used_block_group, flags)))
7614                                 goto release_cluster;
7615
7616                         offset = btrfs_alloc_from_cluster(used_block_group,
7617                                                 last_ptr,
7618                                                 num_bytes,
7619                                                 used_block_group->key.objectid,
7620                                                 &max_extent_size);
7621                         if (offset) {
7622                                 /* we have a block, we're done */
7623                                 spin_unlock(&last_ptr->refill_lock);
7624                                 trace_btrfs_reserve_extent_cluster(root,
7625                                                 used_block_group,
7626                                                 search_start, num_bytes);
7627                                 if (used_block_group != block_group) {
7628                                         btrfs_release_block_group(block_group,
7629                                                                   delalloc);
7630                                         block_group = used_block_group;
7631                                 }
7632                                 goto checks;
7633                         }
7634
7635                         WARN_ON(last_ptr->block_group != used_block_group);
7636 release_cluster:
7637                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7638                          * set up a new clusters, so lets just skip it
7639                          * and let the allocator find whatever block
7640                          * it can find.  If we reach this point, we
7641                          * will have tried the cluster allocator
7642                          * plenty of times and not have found
7643                          * anything, so we are likely way too
7644                          * fragmented for the clustering stuff to find
7645                          * anything.
7646                          *
7647                          * However, if the cluster is taken from the
7648                          * current block group, release the cluster
7649                          * first, so that we stand a better chance of
7650                          * succeeding in the unclustered
7651                          * allocation.  */
7652                         if (loop >= LOOP_NO_EMPTY_SIZE &&
7653                             used_block_group != block_group) {
7654                                 spin_unlock(&last_ptr->refill_lock);
7655                                 btrfs_release_block_group(used_block_group,
7656                                                           delalloc);
7657                                 goto unclustered_alloc;
7658                         }
7659
7660                         /*
7661                          * this cluster didn't work out, free it and
7662                          * start over
7663                          */
7664                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7665
7666                         if (used_block_group != block_group)
7667                                 btrfs_release_block_group(used_block_group,
7668                                                           delalloc);
7669 refill_cluster:
7670                         if (loop >= LOOP_NO_EMPTY_SIZE) {
7671                                 spin_unlock(&last_ptr->refill_lock);
7672                                 goto unclustered_alloc;
7673                         }
7674
7675                         aligned_cluster = max_t(unsigned long,
7676                                                 empty_cluster + empty_size,
7677                                               block_group->full_stripe_len);
7678
7679                         /* allocate a cluster in this block group */
7680                         ret = btrfs_find_space_cluster(root, block_group,
7681                                                        last_ptr, search_start,
7682                                                        num_bytes,
7683                                                        aligned_cluster);
7684                         if (ret == 0) {
7685                                 /*
7686                                  * now pull our allocation out of this
7687                                  * cluster
7688                                  */
7689                                 offset = btrfs_alloc_from_cluster(block_group,
7690                                                         last_ptr,
7691                                                         num_bytes,
7692                                                         search_start,
7693                                                         &max_extent_size);
7694                                 if (offset) {
7695                                         /* we found one, proceed */
7696                                         spin_unlock(&last_ptr->refill_lock);
7697                                         trace_btrfs_reserve_extent_cluster(root,
7698                                                 block_group, search_start,
7699                                                 num_bytes);
7700                                         goto checks;
7701                                 }
7702                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
7703                                    && !failed_cluster_refill) {
7704                                 spin_unlock(&last_ptr->refill_lock);
7705
7706                                 failed_cluster_refill = true;
7707                                 wait_block_group_cache_progress(block_group,
7708                                        num_bytes + empty_cluster + empty_size);
7709                                 goto have_block_group;
7710                         }
7711
7712                         /*
7713                          * at this point we either didn't find a cluster
7714                          * or we weren't able to allocate a block from our
7715                          * cluster.  Free the cluster we've been trying
7716                          * to use, and go to the next block group
7717                          */
7718                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7719                         spin_unlock(&last_ptr->refill_lock);
7720                         goto loop;
7721                 }
7722
7723 unclustered_alloc:
7724                 /*
7725                  * We are doing an unclustered alloc, set the fragmented flag so
7726                  * we don't bother trying to setup a cluster again until we get
7727                  * more space.
7728                  */
7729                 if (unlikely(last_ptr)) {
7730                         spin_lock(&last_ptr->lock);
7731                         last_ptr->fragmented = 1;
7732                         spin_unlock(&last_ptr->lock);
7733                 }
7734                 spin_lock(&block_group->free_space_ctl->tree_lock);
7735                 if (cached &&
7736                     block_group->free_space_ctl->free_space <
7737                     num_bytes + empty_cluster + empty_size) {
7738                         if (block_group->free_space_ctl->free_space >
7739                             max_extent_size)
7740                                 max_extent_size =
7741                                         block_group->free_space_ctl->free_space;
7742                         spin_unlock(&block_group->free_space_ctl->tree_lock);
7743                         goto loop;
7744                 }
7745                 spin_unlock(&block_group->free_space_ctl->tree_lock);
7746
7747                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7748                                                     num_bytes, empty_size,
7749                                                     &max_extent_size);
7750                 /*
7751                  * If we didn't find a chunk, and we haven't failed on this
7752                  * block group before, and this block group is in the middle of
7753                  * caching and we are ok with waiting, then go ahead and wait
7754                  * for progress to be made, and set failed_alloc to true.
7755                  *
7756                  * If failed_alloc is true then we've already waited on this
7757                  * block group once and should move on to the next block group.
7758                  */
7759                 if (!offset && !failed_alloc && !cached &&
7760                     loop > LOOP_CACHING_NOWAIT) {
7761                         wait_block_group_cache_progress(block_group,
7762                                                 num_bytes + empty_size);
7763                         failed_alloc = true;
7764                         goto have_block_group;
7765                 } else if (!offset) {
7766                         goto loop;
7767                 }
7768 checks:
7769                 search_start = ALIGN(offset, root->stripesize);
7770
7771                 /* move on to the next group */
7772                 if (search_start + num_bytes >
7773                     block_group->key.objectid + block_group->key.offset) {
7774                         btrfs_add_free_space(block_group, offset, num_bytes);
7775                         goto loop;
7776                 }
7777
7778                 if (offset < search_start)
7779                         btrfs_add_free_space(block_group, offset,
7780                                              search_start - offset);
7781                 BUG_ON(offset > search_start);
7782
7783                 ret = btrfs_add_reserved_bytes(block_group, num_bytes,
7784                                 alloc_type, delalloc);
7785                 if (ret == -EAGAIN) {
7786                         btrfs_add_free_space(block_group, offset, num_bytes);
7787                         goto loop;
7788                 }
7789                 btrfs_inc_block_group_reservations(block_group);
7790
7791                 /* we are all good, lets return */
7792                 ins->objectid = search_start;
7793                 ins->offset = num_bytes;
7794
7795                 trace_btrfs_reserve_extent(orig_root, block_group,
7796                                            search_start, num_bytes);
7797                 btrfs_release_block_group(block_group, delalloc);
7798                 break;
7799 loop:
7800                 failed_cluster_refill = false;
7801                 failed_alloc = false;
7802                 BUG_ON(index != get_block_group_index(block_group));
7803                 btrfs_release_block_group(block_group, delalloc);
7804         }
7805         up_read(&space_info->groups_sem);
7806
7807         if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7808                 && !orig_have_caching_bg)
7809                 orig_have_caching_bg = true;
7810
7811         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7812                 goto search;
7813
7814         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7815                 goto search;
7816
7817         /*
7818          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7819          *                      caching kthreads as we move along
7820          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7821          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7822          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7823          *                      again
7824          */
7825         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7826                 index = 0;
7827                 if (loop == LOOP_CACHING_NOWAIT) {
7828                         /*
7829                          * We want to skip the LOOP_CACHING_WAIT step if we
7830                          * don't have any uncached bgs and we've already done a
7831                          * full search through.
7832                          */
7833                         if (orig_have_caching_bg || !full_search)
7834                                 loop = LOOP_CACHING_WAIT;
7835                         else
7836                                 loop = LOOP_ALLOC_CHUNK;
7837                 } else {
7838                         loop++;
7839                 }
7840
7841                 if (loop == LOOP_ALLOC_CHUNK) {
7842                         struct btrfs_trans_handle *trans;
7843                         int exist = 0;
7844
7845                         trans = current->journal_info;
7846                         if (trans)
7847                                 exist = 1;
7848                         else
7849                                 trans = btrfs_join_transaction(root);
7850
7851                         if (IS_ERR(trans)) {
7852                                 ret = PTR_ERR(trans);
7853                                 goto out;
7854                         }
7855
7856                         ret = do_chunk_alloc(trans, root, flags,
7857                                              CHUNK_ALLOC_FORCE);
7858
7859                         /*
7860                          * If we can't allocate a new chunk we've already looped
7861                          * through at least once, move on to the NO_EMPTY_SIZE
7862                          * case.
7863                          */
7864                         if (ret == -ENOSPC)
7865                                 loop = LOOP_NO_EMPTY_SIZE;
7866
7867                         /*
7868                          * Do not bail out on ENOSPC since we
7869                          * can do more things.
7870                          */
7871                         if (ret < 0 && ret != -ENOSPC)
7872                                 btrfs_abort_transaction(trans, ret);
7873                         else
7874                                 ret = 0;
7875                         if (!exist)
7876                                 btrfs_end_transaction(trans, root);
7877                         if (ret)
7878                                 goto out;
7879                 }
7880
7881                 if (loop == LOOP_NO_EMPTY_SIZE) {
7882                         /*
7883                          * Don't loop again if we already have no empty_size and
7884                          * no empty_cluster.
7885                          */
7886                         if (empty_size == 0 &&
7887                             empty_cluster == 0) {
7888                                 ret = -ENOSPC;
7889                                 goto out;
7890                         }
7891                         empty_size = 0;
7892                         empty_cluster = 0;
7893                 }
7894
7895                 goto search;
7896         } else if (!ins->objectid) {
7897                 ret = -ENOSPC;
7898         } else if (ins->objectid) {
7899                 if (!use_cluster && last_ptr) {
7900                         spin_lock(&last_ptr->lock);
7901                         last_ptr->window_start = ins->objectid;
7902                         spin_unlock(&last_ptr->lock);
7903                 }
7904                 ret = 0;
7905         }
7906 out:
7907         if (ret == -ENOSPC) {
7908                 spin_lock(&space_info->lock);
7909                 space_info->max_extent_size = max_extent_size;
7910                 spin_unlock(&space_info->lock);
7911                 ins->offset = max_extent_size;
7912         }
7913         return ret;
7914 }
7915
7916 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7917                             int dump_block_groups)
7918 {
7919         struct btrfs_block_group_cache *cache;
7920         int index = 0;
7921
7922         spin_lock(&info->lock);
7923         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7924                info->flags,
7925                info->total_bytes - info->bytes_used - info->bytes_pinned -
7926                info->bytes_reserved - info->bytes_readonly -
7927                info->bytes_may_use, (info->full) ? "" : "not ");
7928         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7929                "reserved=%llu, may_use=%llu, readonly=%llu\n",
7930                info->total_bytes, info->bytes_used, info->bytes_pinned,
7931                info->bytes_reserved, info->bytes_may_use,
7932                info->bytes_readonly);
7933         spin_unlock(&info->lock);
7934
7935         if (!dump_block_groups)
7936                 return;
7937
7938         down_read(&info->groups_sem);
7939 again:
7940         list_for_each_entry(cache, &info->block_groups[index], list) {
7941                 spin_lock(&cache->lock);
7942                 printk(KERN_INFO "BTRFS: "
7943                            "block group %llu has %llu bytes, "
7944                            "%llu used %llu pinned %llu reserved %s\n",
7945                        cache->key.objectid, cache->key.offset,
7946                        btrfs_block_group_used(&cache->item), cache->pinned,
7947                        cache->reserved, cache->ro ? "[readonly]" : "");
7948                 btrfs_dump_free_space(cache, bytes);
7949                 spin_unlock(&cache->lock);
7950         }
7951         if (++index < BTRFS_NR_RAID_TYPES)
7952                 goto again;
7953         up_read(&info->groups_sem);
7954 }
7955
7956 int btrfs_reserve_extent(struct btrfs_root *root,
7957                          u64 num_bytes, u64 min_alloc_size,
7958                          u64 empty_size, u64 hint_byte,
7959                          struct btrfs_key *ins, int is_data, int delalloc)
7960 {
7961         bool final_tried = num_bytes == min_alloc_size;
7962         u64 flags;
7963         int ret;
7964
7965         flags = btrfs_get_alloc_profile(root, is_data);
7966 again:
7967         WARN_ON(num_bytes < root->sectorsize);
7968         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7969                                flags, delalloc);
7970         if (!ret && !is_data) {
7971                 btrfs_dec_block_group_reservations(root->fs_info,
7972                                                    ins->objectid);
7973         } else if (ret == -ENOSPC) {
7974                 if (!final_tried && ins->offset) {
7975                         num_bytes = min(num_bytes >> 1, ins->offset);
7976                         num_bytes = round_down(num_bytes, root->sectorsize);
7977                         num_bytes = max(num_bytes, min_alloc_size);
7978                         if (num_bytes == min_alloc_size)
7979                                 final_tried = true;
7980                         goto again;
7981                 } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
7982                         struct btrfs_space_info *sinfo;
7983
7984                         sinfo = __find_space_info(root->fs_info, flags);
7985                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
7986                                 flags, num_bytes);
7987                         if (sinfo)
7988                                 dump_space_info(sinfo, num_bytes, 1);
7989                 }
7990         }
7991
7992         return ret;
7993 }
7994
7995 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7996                                         u64 start, u64 len,
7997                                         int pin, int delalloc)
7998 {
7999         struct btrfs_block_group_cache *cache;
8000         int ret = 0;
8001
8002         cache = btrfs_lookup_block_group(root->fs_info, start);
8003         if (!cache) {
8004                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
8005                         start);
8006                 return -ENOSPC;
8007         }
8008
8009         if (pin)
8010                 pin_down_extent(root, cache, start, len, 1);
8011         else {
8012                 if (btrfs_test_opt(root->fs_info, DISCARD))
8013                         ret = btrfs_discard_extent(root, start, len, NULL);
8014                 btrfs_add_free_space(cache, start, len);
8015                 btrfs_free_reserved_bytes(cache, len, delalloc);
8016                 trace_btrfs_reserved_extent_free(root, start, len);
8017         }
8018
8019         btrfs_put_block_group(cache);
8020         return ret;
8021 }
8022
8023 int btrfs_free_reserved_extent(struct btrfs_root *root,
8024                                u64 start, u64 len, int delalloc)
8025 {
8026         return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
8027 }
8028
8029 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
8030                                        u64 start, u64 len)
8031 {
8032         return __btrfs_free_reserved_extent(root, start, len, 1, 0);
8033 }
8034
8035 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8036                                       struct btrfs_root *root,
8037                                       u64 parent, u64 root_objectid,
8038                                       u64 flags, u64 owner, u64 offset,
8039                                       struct btrfs_key *ins, int ref_mod)
8040 {
8041         int ret;
8042         struct btrfs_fs_info *fs_info = root->fs_info;
8043         struct btrfs_extent_item *extent_item;
8044         struct btrfs_extent_inline_ref *iref;
8045         struct btrfs_path *path;
8046         struct extent_buffer *leaf;
8047         int type;
8048         u32 size;
8049
8050         if (parent > 0)
8051                 type = BTRFS_SHARED_DATA_REF_KEY;
8052         else
8053                 type = BTRFS_EXTENT_DATA_REF_KEY;
8054
8055         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8056
8057         path = btrfs_alloc_path();
8058         if (!path)
8059                 return -ENOMEM;
8060
8061         path->leave_spinning = 1;
8062         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8063                                       ins, size);
8064         if (ret) {
8065                 btrfs_free_path(path);
8066                 return ret;
8067         }
8068
8069         leaf = path->nodes[0];
8070         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8071                                      struct btrfs_extent_item);
8072         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8073         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8074         btrfs_set_extent_flags(leaf, extent_item,
8075                                flags | BTRFS_EXTENT_FLAG_DATA);
8076
8077         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8078         btrfs_set_extent_inline_ref_type(leaf, iref, type);
8079         if (parent > 0) {
8080                 struct btrfs_shared_data_ref *ref;
8081                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
8082                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8083                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8084         } else {
8085                 struct btrfs_extent_data_ref *ref;
8086                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8087                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8088                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8089                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8090                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8091         }
8092
8093         btrfs_mark_buffer_dirty(path->nodes[0]);
8094         btrfs_free_path(path);
8095
8096         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8097                                           ins->offset);
8098         if (ret)
8099                 return ret;
8100
8101         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
8102         if (ret) { /* -ENOENT, logic error */
8103                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8104                         ins->objectid, ins->offset);
8105                 BUG();
8106         }
8107         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
8108         return ret;
8109 }
8110
8111 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8112                                      struct btrfs_root *root,
8113                                      u64 parent, u64 root_objectid,
8114                                      u64 flags, struct btrfs_disk_key *key,
8115                                      int level, struct btrfs_key *ins)
8116 {
8117         int ret;
8118         struct btrfs_fs_info *fs_info = root->fs_info;
8119         struct btrfs_extent_item *extent_item;
8120         struct btrfs_tree_block_info *block_info;
8121         struct btrfs_extent_inline_ref *iref;
8122         struct btrfs_path *path;
8123         struct extent_buffer *leaf;
8124         u32 size = sizeof(*extent_item) + sizeof(*iref);
8125         u64 num_bytes = ins->offset;
8126         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8127                                                  SKINNY_METADATA);
8128
8129         if (!skinny_metadata)
8130                 size += sizeof(*block_info);
8131
8132         path = btrfs_alloc_path();
8133         if (!path) {
8134                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8135                                                    root->nodesize);
8136                 return -ENOMEM;
8137         }
8138
8139         path->leave_spinning = 1;
8140         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8141                                       ins, size);
8142         if (ret) {
8143                 btrfs_free_path(path);
8144                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8145                                                    root->nodesize);
8146                 return ret;
8147         }
8148
8149         leaf = path->nodes[0];
8150         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8151                                      struct btrfs_extent_item);
8152         btrfs_set_extent_refs(leaf, extent_item, 1);
8153         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8154         btrfs_set_extent_flags(leaf, extent_item,
8155                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8156
8157         if (skinny_metadata) {
8158                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8159                 num_bytes = root->nodesize;
8160         } else {
8161                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8162                 btrfs_set_tree_block_key(leaf, block_info, key);
8163                 btrfs_set_tree_block_level(leaf, block_info, level);
8164                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8165         }
8166
8167         if (parent > 0) {
8168                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8169                 btrfs_set_extent_inline_ref_type(leaf, iref,
8170                                                  BTRFS_SHARED_BLOCK_REF_KEY);
8171                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8172         } else {
8173                 btrfs_set_extent_inline_ref_type(leaf, iref,
8174                                                  BTRFS_TREE_BLOCK_REF_KEY);
8175                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8176         }
8177
8178         btrfs_mark_buffer_dirty(leaf);
8179         btrfs_free_path(path);
8180
8181         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8182                                           num_bytes);
8183         if (ret)
8184                 return ret;
8185
8186         ret = update_block_group(trans, root, ins->objectid, root->nodesize,
8187                                  1);
8188         if (ret) { /* -ENOENT, logic error */
8189                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8190                         ins->objectid, ins->offset);
8191                 BUG();
8192         }
8193
8194         trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
8195         return ret;
8196 }
8197
8198 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8199                                      struct btrfs_root *root,
8200                                      u64 root_objectid, u64 owner,
8201                                      u64 offset, u64 ram_bytes,
8202                                      struct btrfs_key *ins)
8203 {
8204         int ret;
8205
8206         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
8207
8208         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
8209                                          ins->offset, 0,
8210                                          root_objectid, owner, offset,
8211                                          ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
8212                                          NULL);
8213         return ret;
8214 }
8215
8216 /*
8217  * this is used by the tree logging recovery code.  It records that
8218  * an extent has been allocated and makes sure to clear the free
8219  * space cache bits as well
8220  */
8221 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8222                                    struct btrfs_root *root,
8223                                    u64 root_objectid, u64 owner, u64 offset,
8224                                    struct btrfs_key *ins)
8225 {
8226         int ret;
8227         struct btrfs_block_group_cache *block_group;
8228
8229         /*
8230          * Mixed block groups will exclude before processing the log so we only
8231          * need to do the exclude dance if this fs isn't mixed.
8232          */
8233         if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
8234                 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
8235                 if (ret)
8236                         return ret;
8237         }
8238
8239         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
8240         if (!block_group)
8241                 return -EINVAL;
8242
8243         ret = btrfs_add_reserved_bytes(block_group, ins->offset,
8244                         RESERVE_ALLOC_NO_ACCOUNT, 0);
8245         BUG_ON(ret); /* logic error */
8246         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
8247                                          0, owner, offset, ins, 1);
8248         btrfs_put_block_group(block_group);
8249         return ret;
8250 }
8251
8252 static struct extent_buffer *
8253 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8254                       u64 bytenr, int level)
8255 {
8256         struct extent_buffer *buf;
8257
8258         buf = btrfs_find_create_tree_block(root, bytenr);
8259         if (IS_ERR(buf))
8260                 return buf;
8261
8262         btrfs_set_header_generation(buf, trans->transid);
8263         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8264         btrfs_tree_lock(buf);
8265         clean_tree_block(trans, root->fs_info, buf);
8266         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8267
8268         btrfs_set_lock_blocking(buf);
8269         set_extent_buffer_uptodate(buf);
8270
8271         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8272                 buf->log_index = root->log_transid % 2;
8273                 /*
8274                  * we allow two log transactions at a time, use different
8275                  * EXENT bit to differentiate dirty pages.
8276                  */
8277                 if (buf->log_index == 0)
8278                         set_extent_dirty(&root->dirty_log_pages, buf->start,
8279                                         buf->start + buf->len - 1, GFP_NOFS);
8280                 else
8281                         set_extent_new(&root->dirty_log_pages, buf->start,
8282                                         buf->start + buf->len - 1);
8283         } else {
8284                 buf->log_index = -1;
8285                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8286                          buf->start + buf->len - 1, GFP_NOFS);
8287         }
8288         trans->dirty = true;
8289         /* this returns a buffer locked for blocking */
8290         return buf;
8291 }
8292
8293 static struct btrfs_block_rsv *
8294 use_block_rsv(struct btrfs_trans_handle *trans,
8295               struct btrfs_root *root, u32 blocksize)
8296 {
8297         struct btrfs_block_rsv *block_rsv;
8298         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
8299         int ret;
8300         bool global_updated = false;
8301
8302         block_rsv = get_block_rsv(trans, root);
8303
8304         if (unlikely(block_rsv->size == 0))
8305                 goto try_reserve;
8306 again:
8307         ret = block_rsv_use_bytes(block_rsv, blocksize);
8308         if (!ret)
8309                 return block_rsv;
8310
8311         if (block_rsv->failfast)
8312                 return ERR_PTR(ret);
8313
8314         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8315                 global_updated = true;
8316                 update_global_block_rsv(root->fs_info);
8317                 goto again;
8318         }
8319
8320         if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
8321                 static DEFINE_RATELIMIT_STATE(_rs,
8322                                 DEFAULT_RATELIMIT_INTERVAL * 10,
8323                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
8324                 if (__ratelimit(&_rs))
8325                         WARN(1, KERN_DEBUG
8326                                 "BTRFS: block rsv returned %d\n", ret);
8327         }
8328 try_reserve:
8329         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8330                                      BTRFS_RESERVE_NO_FLUSH);
8331         if (!ret)
8332                 return block_rsv;
8333         /*
8334          * If we couldn't reserve metadata bytes try and use some from
8335          * the global reserve if its space type is the same as the global
8336          * reservation.
8337          */
8338         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8339             block_rsv->space_info == global_rsv->space_info) {
8340                 ret = block_rsv_use_bytes(global_rsv, blocksize);
8341                 if (!ret)
8342                         return global_rsv;
8343         }
8344         return ERR_PTR(ret);
8345 }
8346
8347 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8348                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
8349 {
8350         block_rsv_add_bytes(block_rsv, blocksize, 0);
8351         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
8352 }
8353
8354 /*
8355  * finds a free extent and does all the dirty work required for allocation
8356  * returns the tree buffer or an ERR_PTR on error.
8357  */
8358 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8359                                         struct btrfs_root *root,
8360                                         u64 parent, u64 root_objectid,
8361                                         struct btrfs_disk_key *key, int level,
8362                                         u64 hint, u64 empty_size)
8363 {
8364         struct btrfs_key ins;
8365         struct btrfs_block_rsv *block_rsv;
8366         struct extent_buffer *buf;
8367         struct btrfs_delayed_extent_op *extent_op;
8368         u64 flags = 0;
8369         int ret;
8370         u32 blocksize = root->nodesize;
8371         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8372                                                  SKINNY_METADATA);
8373
8374 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8375         if (btrfs_is_testing(root->fs_info)) {
8376                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8377                                             level);
8378                 if (!IS_ERR(buf))
8379                         root->alloc_bytenr += blocksize;
8380                 return buf;
8381         }
8382 #endif
8383
8384         block_rsv = use_block_rsv(trans, root, blocksize);
8385         if (IS_ERR(block_rsv))
8386                 return ERR_CAST(block_rsv);
8387
8388         ret = btrfs_reserve_extent(root, blocksize, blocksize,
8389                                    empty_size, hint, &ins, 0, 0);
8390         if (ret)
8391                 goto out_unuse;
8392
8393         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8394         if (IS_ERR(buf)) {
8395                 ret = PTR_ERR(buf);
8396                 goto out_free_reserved;
8397         }
8398
8399         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8400                 if (parent == 0)
8401                         parent = ins.objectid;
8402                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8403         } else
8404                 BUG_ON(parent > 0);
8405
8406         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8407                 extent_op = btrfs_alloc_delayed_extent_op();
8408                 if (!extent_op) {
8409                         ret = -ENOMEM;
8410                         goto out_free_buf;
8411                 }
8412                 if (key)
8413                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8414                 else
8415                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8416                 extent_op->flags_to_set = flags;
8417                 extent_op->update_key = skinny_metadata ? false : true;
8418                 extent_op->update_flags = true;
8419                 extent_op->is_data = false;
8420                 extent_op->level = level;
8421
8422                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
8423                                                  ins.objectid, ins.offset,
8424                                                  parent, root_objectid, level,
8425                                                  BTRFS_ADD_DELAYED_EXTENT,
8426                                                  extent_op);
8427                 if (ret)
8428                         goto out_free_delayed;
8429         }
8430         return buf;
8431
8432 out_free_delayed:
8433         btrfs_free_delayed_extent_op(extent_op);
8434 out_free_buf:
8435         free_extent_buffer(buf);
8436 out_free_reserved:
8437         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
8438 out_unuse:
8439         unuse_block_rsv(root->fs_info, block_rsv, blocksize);
8440         return ERR_PTR(ret);
8441 }
8442
8443 struct walk_control {
8444         u64 refs[BTRFS_MAX_LEVEL];
8445         u64 flags[BTRFS_MAX_LEVEL];
8446         struct btrfs_key update_progress;
8447         int stage;
8448         int level;
8449         int shared_level;
8450         int update_ref;
8451         int keep_locks;
8452         int reada_slot;
8453         int reada_count;
8454         int for_reloc;
8455 };
8456
8457 #define DROP_REFERENCE  1
8458 #define UPDATE_BACKREF  2
8459
8460 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8461                                      struct btrfs_root *root,
8462                                      struct walk_control *wc,
8463                                      struct btrfs_path *path)
8464 {
8465         u64 bytenr;
8466         u64 generation;
8467         u64 refs;
8468         u64 flags;
8469         u32 nritems;
8470         u32 blocksize;
8471         struct btrfs_key key;
8472         struct extent_buffer *eb;
8473         int ret;
8474         int slot;
8475         int nread = 0;
8476
8477         if (path->slots[wc->level] < wc->reada_slot) {
8478                 wc->reada_count = wc->reada_count * 2 / 3;
8479                 wc->reada_count = max(wc->reada_count, 2);
8480         } else {
8481                 wc->reada_count = wc->reada_count * 3 / 2;
8482                 wc->reada_count = min_t(int, wc->reada_count,
8483                                         BTRFS_NODEPTRS_PER_BLOCK(root));
8484         }
8485
8486         eb = path->nodes[wc->level];
8487         nritems = btrfs_header_nritems(eb);
8488         blocksize = root->nodesize;
8489
8490         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8491                 if (nread >= wc->reada_count)
8492                         break;
8493
8494                 cond_resched();
8495                 bytenr = btrfs_node_blockptr(eb, slot);
8496                 generation = btrfs_node_ptr_generation(eb, slot);
8497
8498                 if (slot == path->slots[wc->level])
8499                         goto reada;
8500
8501                 if (wc->stage == UPDATE_BACKREF &&
8502                     generation <= root->root_key.offset)
8503                         continue;
8504
8505                 /* We don't lock the tree block, it's OK to be racy here */
8506                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
8507                                                wc->level - 1, 1, &refs,
8508                                                &flags);
8509                 /* We don't care about errors in readahead. */
8510                 if (ret < 0)
8511                         continue;
8512                 BUG_ON(refs == 0);
8513
8514                 if (wc->stage == DROP_REFERENCE) {
8515                         if (refs == 1)
8516                                 goto reada;
8517
8518                         if (wc->level == 1 &&
8519                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8520                                 continue;
8521                         if (!wc->update_ref ||
8522                             generation <= root->root_key.offset)
8523                                 continue;
8524                         btrfs_node_key_to_cpu(eb, &key, slot);
8525                         ret = btrfs_comp_cpu_keys(&key,
8526                                                   &wc->update_progress);
8527                         if (ret < 0)
8528                                 continue;
8529                 } else {
8530                         if (wc->level == 1 &&
8531                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8532                                 continue;
8533                 }
8534 reada:
8535                 readahead_tree_block(root, bytenr);
8536                 nread++;
8537         }
8538         wc->reada_slot = slot;
8539 }
8540
8541 static int account_leaf_items(struct btrfs_trans_handle *trans,
8542                               struct btrfs_root *root,
8543                               struct extent_buffer *eb)
8544 {
8545         int nr = btrfs_header_nritems(eb);
8546         int i, extent_type, ret;
8547         struct btrfs_key key;
8548         struct btrfs_file_extent_item *fi;
8549         u64 bytenr, num_bytes;
8550
8551         /* We can be called directly from walk_up_proc() */
8552         if (!root->fs_info->quota_enabled)
8553                 return 0;
8554
8555         for (i = 0; i < nr; i++) {
8556                 btrfs_item_key_to_cpu(eb, &key, i);
8557
8558                 if (key.type != BTRFS_EXTENT_DATA_KEY)
8559                         continue;
8560
8561                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
8562                 /* filter out non qgroup-accountable extents  */
8563                 extent_type = btrfs_file_extent_type(eb, fi);
8564
8565                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
8566                         continue;
8567
8568                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
8569                 if (!bytenr)
8570                         continue;
8571
8572                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8573
8574                 ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
8575                                 bytenr, num_bytes, GFP_NOFS);
8576                 if (ret)
8577                         return ret;
8578         }
8579         return 0;
8580 }
8581
8582 /*
8583  * Walk up the tree from the bottom, freeing leaves and any interior
8584  * nodes which have had all slots visited. If a node (leaf or
8585  * interior) is freed, the node above it will have it's slot
8586  * incremented. The root node will never be freed.
8587  *
8588  * At the end of this function, we should have a path which has all
8589  * slots incremented to the next position for a search. If we need to
8590  * read a new node it will be NULL and the node above it will have the
8591  * correct slot selected for a later read.
8592  *
8593  * If we increment the root nodes slot counter past the number of
8594  * elements, 1 is returned to signal completion of the search.
8595  */
8596 static int adjust_slots_upwards(struct btrfs_root *root,
8597                                 struct btrfs_path *path, int root_level)
8598 {
8599         int level = 0;
8600         int nr, slot;
8601         struct extent_buffer *eb;
8602
8603         if (root_level == 0)
8604                 return 1;
8605
8606         while (level <= root_level) {
8607                 eb = path->nodes[level];
8608                 nr = btrfs_header_nritems(eb);
8609                 path->slots[level]++;
8610                 slot = path->slots[level];
8611                 if (slot >= nr || level == 0) {
8612                         /*
8613                          * Don't free the root -  we will detect this
8614                          * condition after our loop and return a
8615                          * positive value for caller to stop walking the tree.
8616                          */
8617                         if (level != root_level) {
8618                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8619                                 path->locks[level] = 0;
8620
8621                                 free_extent_buffer(eb);
8622                                 path->nodes[level] = NULL;
8623                                 path->slots[level] = 0;
8624                         }
8625                 } else {
8626                         /*
8627                          * We have a valid slot to walk back down
8628                          * from. Stop here so caller can process these
8629                          * new nodes.
8630                          */
8631                         break;
8632                 }
8633
8634                 level++;
8635         }
8636
8637         eb = path->nodes[root_level];
8638         if (path->slots[root_level] >= btrfs_header_nritems(eb))
8639                 return 1;
8640
8641         return 0;
8642 }
8643
8644 /*
8645  * root_eb is the subtree root and is locked before this function is called.
8646  */
8647 static int account_shared_subtree(struct btrfs_trans_handle *trans,
8648                                   struct btrfs_root *root,
8649                                   struct extent_buffer *root_eb,
8650                                   u64 root_gen,
8651                                   int root_level)
8652 {
8653         int ret = 0;
8654         int level;
8655         struct extent_buffer *eb = root_eb;
8656         struct btrfs_path *path = NULL;
8657
8658         BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
8659         BUG_ON(root_eb == NULL);
8660
8661         if (!root->fs_info->quota_enabled)
8662                 return 0;
8663
8664         if (!extent_buffer_uptodate(root_eb)) {
8665                 ret = btrfs_read_buffer(root_eb, root_gen);
8666                 if (ret)
8667                         goto out;
8668         }
8669
8670         if (root_level == 0) {
8671                 ret = account_leaf_items(trans, root, root_eb);
8672                 goto out;
8673         }
8674
8675         path = btrfs_alloc_path();
8676         if (!path)
8677                 return -ENOMEM;
8678
8679         /*
8680          * Walk down the tree.  Missing extent blocks are filled in as
8681          * we go. Metadata is accounted every time we read a new
8682          * extent block.
8683          *
8684          * When we reach a leaf, we account for file extent items in it,
8685          * walk back up the tree (adjusting slot pointers as we go)
8686          * and restart the search process.
8687          */
8688         extent_buffer_get(root_eb); /* For path */
8689         path->nodes[root_level] = root_eb;
8690         path->slots[root_level] = 0;
8691         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
8692 walk_down:
8693         level = root_level;
8694         while (level >= 0) {
8695                 if (path->nodes[level] == NULL) {
8696                         int parent_slot;
8697                         u64 child_gen;
8698                         u64 child_bytenr;
8699
8700                         /* We need to get child blockptr/gen from
8701                          * parent before we can read it. */
8702                         eb = path->nodes[level + 1];
8703                         parent_slot = path->slots[level + 1];
8704                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
8705                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
8706
8707                         eb = read_tree_block(root, child_bytenr, child_gen);
8708                         if (IS_ERR(eb)) {
8709                                 ret = PTR_ERR(eb);
8710                                 goto out;
8711                         } else if (!extent_buffer_uptodate(eb)) {
8712                                 free_extent_buffer(eb);
8713                                 ret = -EIO;
8714                                 goto out;
8715                         }
8716
8717                         path->nodes[level] = eb;
8718                         path->slots[level] = 0;
8719
8720                         btrfs_tree_read_lock(eb);
8721                         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8722                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8723
8724                         ret = btrfs_qgroup_insert_dirty_extent(trans,
8725                                         root->fs_info, child_bytenr,
8726                                         root->nodesize, GFP_NOFS);
8727                         if (ret)
8728                                 goto out;
8729                 }
8730
8731                 if (level == 0) {
8732                         ret = account_leaf_items(trans, root, path->nodes[level]);
8733                         if (ret)
8734                                 goto out;
8735
8736                         /* Nonzero return here means we completed our search */
8737                         ret = adjust_slots_upwards(root, path, root_level);
8738                         if (ret)
8739                                 break;
8740
8741                         /* Restart search with new slots */
8742                         goto walk_down;
8743                 }
8744
8745                 level--;
8746         }
8747
8748         ret = 0;
8749 out:
8750         btrfs_free_path(path);
8751
8752         return ret;
8753 }
8754
8755 /*
8756  * helper to process tree block while walking down the tree.
8757  *
8758  * when wc->stage == UPDATE_BACKREF, this function updates
8759  * back refs for pointers in the block.
8760  *
8761  * NOTE: return value 1 means we should stop walking down.
8762  */
8763 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8764                                    struct btrfs_root *root,
8765                                    struct btrfs_path *path,
8766                                    struct walk_control *wc, int lookup_info)
8767 {
8768         int level = wc->level;
8769         struct extent_buffer *eb = path->nodes[level];
8770         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8771         int ret;
8772
8773         if (wc->stage == UPDATE_BACKREF &&
8774             btrfs_header_owner(eb) != root->root_key.objectid)
8775                 return 1;
8776
8777         /*
8778          * when reference count of tree block is 1, it won't increase
8779          * again. once full backref flag is set, we never clear it.
8780          */
8781         if (lookup_info &&
8782             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8783              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8784                 BUG_ON(!path->locks[level]);
8785                 ret = btrfs_lookup_extent_info(trans, root,
8786                                                eb->start, level, 1,
8787                                                &wc->refs[level],
8788                                                &wc->flags[level]);
8789                 BUG_ON(ret == -ENOMEM);
8790                 if (ret)
8791                         return ret;
8792                 BUG_ON(wc->refs[level] == 0);
8793         }
8794
8795         if (wc->stage == DROP_REFERENCE) {
8796                 if (wc->refs[level] > 1)
8797                         return 1;
8798
8799                 if (path->locks[level] && !wc->keep_locks) {
8800                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8801                         path->locks[level] = 0;
8802                 }
8803                 return 0;
8804         }
8805
8806         /* wc->stage == UPDATE_BACKREF */
8807         if (!(wc->flags[level] & flag)) {
8808                 BUG_ON(!path->locks[level]);
8809                 ret = btrfs_inc_ref(trans, root, eb, 1);
8810                 BUG_ON(ret); /* -ENOMEM */
8811                 ret = btrfs_dec_ref(trans, root, eb, 0);
8812                 BUG_ON(ret); /* -ENOMEM */
8813                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8814                                                   eb->len, flag,
8815                                                   btrfs_header_level(eb), 0);
8816                 BUG_ON(ret); /* -ENOMEM */
8817                 wc->flags[level] |= flag;
8818         }
8819
8820         /*
8821          * the block is shared by multiple trees, so it's not good to
8822          * keep the tree lock
8823          */
8824         if (path->locks[level] && level > 0) {
8825                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8826                 path->locks[level] = 0;
8827         }
8828         return 0;
8829 }
8830
8831 /*
8832  * helper to process tree block pointer.
8833  *
8834  * when wc->stage == DROP_REFERENCE, this function checks
8835  * reference count of the block pointed to. if the block
8836  * is shared and we need update back refs for the subtree
8837  * rooted at the block, this function changes wc->stage to
8838  * UPDATE_BACKREF. if the block is shared and there is no
8839  * need to update back, this function drops the reference
8840  * to the block.
8841  *
8842  * NOTE: return value 1 means we should stop walking down.
8843  */
8844 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8845                                  struct btrfs_root *root,
8846                                  struct btrfs_path *path,
8847                                  struct walk_control *wc, int *lookup_info)
8848 {
8849         u64 bytenr;
8850         u64 generation;
8851         u64 parent;
8852         u32 blocksize;
8853         struct btrfs_key key;
8854         struct extent_buffer *next;
8855         int level = wc->level;
8856         int reada = 0;
8857         int ret = 0;
8858         bool need_account = false;
8859
8860         generation = btrfs_node_ptr_generation(path->nodes[level],
8861                                                path->slots[level]);
8862         /*
8863          * if the lower level block was created before the snapshot
8864          * was created, we know there is no need to update back refs
8865          * for the subtree
8866          */
8867         if (wc->stage == UPDATE_BACKREF &&
8868             generation <= root->root_key.offset) {
8869                 *lookup_info = 1;
8870                 return 1;
8871         }
8872
8873         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8874         blocksize = root->nodesize;
8875
8876         next = btrfs_find_tree_block(root->fs_info, bytenr);
8877         if (!next) {
8878                 next = btrfs_find_create_tree_block(root, bytenr);
8879                 if (IS_ERR(next))
8880                         return PTR_ERR(next);
8881
8882                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8883                                                level - 1);
8884                 reada = 1;
8885         }
8886         btrfs_tree_lock(next);
8887         btrfs_set_lock_blocking(next);
8888
8889         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8890                                        &wc->refs[level - 1],
8891                                        &wc->flags[level - 1]);
8892         if (ret < 0) {
8893                 btrfs_tree_unlock(next);
8894                 return ret;
8895         }
8896
8897         if (unlikely(wc->refs[level - 1] == 0)) {
8898                 btrfs_err(root->fs_info, "Missing references.");
8899                 BUG();
8900         }
8901         *lookup_info = 0;
8902
8903         if (wc->stage == DROP_REFERENCE) {
8904                 if (wc->refs[level - 1] > 1) {
8905                         need_account = true;
8906                         if (level == 1 &&
8907                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8908                                 goto skip;
8909
8910                         if (!wc->update_ref ||
8911                             generation <= root->root_key.offset)
8912                                 goto skip;
8913
8914                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8915                                               path->slots[level]);
8916                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8917                         if (ret < 0)
8918                                 goto skip;
8919
8920                         wc->stage = UPDATE_BACKREF;
8921                         wc->shared_level = level - 1;
8922                 }
8923         } else {
8924                 if (level == 1 &&
8925                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8926                         goto skip;
8927         }
8928
8929         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8930                 btrfs_tree_unlock(next);
8931                 free_extent_buffer(next);
8932                 next = NULL;
8933                 *lookup_info = 1;
8934         }
8935
8936         if (!next) {
8937                 if (reada && level == 1)
8938                         reada_walk_down(trans, root, wc, path);
8939                 next = read_tree_block(root, bytenr, generation);
8940                 if (IS_ERR(next)) {
8941                         return PTR_ERR(next);
8942                 } else if (!extent_buffer_uptodate(next)) {
8943                         free_extent_buffer(next);
8944                         return -EIO;
8945                 }
8946                 btrfs_tree_lock(next);
8947                 btrfs_set_lock_blocking(next);
8948         }
8949
8950         level--;
8951         BUG_ON(level != btrfs_header_level(next));
8952         path->nodes[level] = next;
8953         path->slots[level] = 0;
8954         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8955         wc->level = level;
8956         if (wc->level == 1)
8957                 wc->reada_slot = 0;
8958         return 0;
8959 skip:
8960         wc->refs[level - 1] = 0;
8961         wc->flags[level - 1] = 0;
8962         if (wc->stage == DROP_REFERENCE) {
8963                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8964                         parent = path->nodes[level]->start;
8965                 } else {
8966                         BUG_ON(root->root_key.objectid !=
8967                                btrfs_header_owner(path->nodes[level]));
8968                         parent = 0;
8969                 }
8970
8971                 if (need_account) {
8972                         ret = account_shared_subtree(trans, root, next,
8973                                                      generation, level - 1);
8974                         if (ret) {
8975                                 btrfs_err_rl(root->fs_info,
8976                                         "Error "
8977                                         "%d accounting shared subtree. Quota "
8978                                         "is out of sync, rescan required.",
8979                                         ret);
8980                         }
8981                 }
8982                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8983                                 root->root_key.objectid, level - 1, 0);
8984                 BUG_ON(ret); /* -ENOMEM */
8985         }
8986         btrfs_tree_unlock(next);
8987         free_extent_buffer(next);
8988         *lookup_info = 1;
8989         return 1;
8990 }
8991
8992 /*
8993  * helper to process tree block while walking up the tree.
8994  *
8995  * when wc->stage == DROP_REFERENCE, this function drops
8996  * reference count on the block.
8997  *
8998  * when wc->stage == UPDATE_BACKREF, this function changes
8999  * wc->stage back to DROP_REFERENCE if we changed wc->stage
9000  * to UPDATE_BACKREF previously while processing the block.
9001  *
9002  * NOTE: return value 1 means we should stop walking up.
9003  */
9004 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9005                                  struct btrfs_root *root,
9006                                  struct btrfs_path *path,
9007                                  struct walk_control *wc)
9008 {
9009         int ret;
9010         int level = wc->level;
9011         struct extent_buffer *eb = path->nodes[level];
9012         u64 parent = 0;
9013
9014         if (wc->stage == UPDATE_BACKREF) {
9015                 BUG_ON(wc->shared_level < level);
9016                 if (level < wc->shared_level)
9017                         goto out;
9018
9019                 ret = find_next_key(path, level + 1, &wc->update_progress);
9020                 if (ret > 0)
9021                         wc->update_ref = 0;
9022
9023                 wc->stage = DROP_REFERENCE;
9024                 wc->shared_level = -1;
9025                 path->slots[level] = 0;
9026
9027                 /*
9028                  * check reference count again if the block isn't locked.
9029                  * we should start walking down the tree again if reference
9030                  * count is one.
9031                  */
9032                 if (!path->locks[level]) {
9033                         BUG_ON(level == 0);
9034                         btrfs_tree_lock(eb);
9035                         btrfs_set_lock_blocking(eb);
9036                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9037
9038                         ret = btrfs_lookup_extent_info(trans, root,
9039                                                        eb->start, level, 1,
9040                                                        &wc->refs[level],
9041                                                        &wc->flags[level]);
9042                         if (ret < 0) {
9043                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9044                                 path->locks[level] = 0;
9045                                 return ret;
9046                         }
9047                         BUG_ON(wc->refs[level] == 0);
9048                         if (wc->refs[level] == 1) {
9049                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9050                                 path->locks[level] = 0;
9051                                 return 1;
9052                         }
9053                 }
9054         }
9055
9056         /* wc->stage == DROP_REFERENCE */
9057         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9058
9059         if (wc->refs[level] == 1) {
9060                 if (level == 0) {
9061                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9062                                 ret = btrfs_dec_ref(trans, root, eb, 1);
9063                         else
9064                                 ret = btrfs_dec_ref(trans, root, eb, 0);
9065                         BUG_ON(ret); /* -ENOMEM */
9066                         ret = account_leaf_items(trans, root, eb);
9067                         if (ret) {
9068                                 btrfs_err_rl(root->fs_info,
9069                                         "error "
9070                                         "%d accounting leaf items. Quota "
9071                                         "is out of sync, rescan required.",
9072                                         ret);
9073                         }
9074                 }
9075                 /* make block locked assertion in clean_tree_block happy */
9076                 if (!path->locks[level] &&
9077                     btrfs_header_generation(eb) == trans->transid) {
9078                         btrfs_tree_lock(eb);
9079                         btrfs_set_lock_blocking(eb);
9080                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9081                 }
9082                 clean_tree_block(trans, root->fs_info, eb);
9083         }
9084
9085         if (eb == root->node) {
9086                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9087                         parent = eb->start;
9088                 else
9089                         BUG_ON(root->root_key.objectid !=
9090                                btrfs_header_owner(eb));
9091         } else {
9092                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9093                         parent = path->nodes[level + 1]->start;
9094                 else
9095                         BUG_ON(root->root_key.objectid !=
9096                                btrfs_header_owner(path->nodes[level + 1]));
9097         }
9098
9099         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9100 out:
9101         wc->refs[level] = 0;
9102         wc->flags[level] = 0;
9103         return 0;
9104 }
9105
9106 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9107                                    struct btrfs_root *root,
9108                                    struct btrfs_path *path,
9109                                    struct walk_control *wc)
9110 {
9111         int level = wc->level;
9112         int lookup_info = 1;
9113         int ret;
9114
9115         while (level >= 0) {
9116                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
9117                 if (ret > 0)
9118                         break;
9119
9120                 if (level == 0)
9121                         break;
9122
9123                 if (path->slots[level] >=
9124                     btrfs_header_nritems(path->nodes[level]))
9125                         break;
9126
9127                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
9128                 if (ret > 0) {
9129                         path->slots[level]++;
9130                         continue;
9131                 } else if (ret < 0)
9132                         return ret;
9133                 level = wc->level;
9134         }
9135         return 0;
9136 }
9137
9138 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9139                                  struct btrfs_root *root,
9140                                  struct btrfs_path *path,
9141                                  struct walk_control *wc, int max_level)
9142 {
9143         int level = wc->level;
9144         int ret;
9145
9146         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9147         while (level < max_level && path->nodes[level]) {
9148                 wc->level = level;
9149                 if (path->slots[level] + 1 <
9150                     btrfs_header_nritems(path->nodes[level])) {
9151                         path->slots[level]++;
9152                         return 0;
9153                 } else {
9154                         ret = walk_up_proc(trans, root, path, wc);
9155                         if (ret > 0)
9156                                 return 0;
9157
9158                         if (path->locks[level]) {
9159                                 btrfs_tree_unlock_rw(path->nodes[level],
9160                                                      path->locks[level]);
9161                                 path->locks[level] = 0;
9162                         }
9163                         free_extent_buffer(path->nodes[level]);
9164                         path->nodes[level] = NULL;
9165                         level++;
9166                 }
9167         }
9168         return 1;
9169 }
9170
9171 /*
9172  * drop a subvolume tree.
9173  *
9174  * this function traverses the tree freeing any blocks that only
9175  * referenced by the tree.
9176  *
9177  * when a shared tree block is found. this function decreases its
9178  * reference count by one. if update_ref is true, this function
9179  * also make sure backrefs for the shared block and all lower level
9180  * blocks are properly updated.
9181  *
9182  * If called with for_reloc == 0, may exit early with -EAGAIN
9183  */
9184 int btrfs_drop_snapshot(struct btrfs_root *root,
9185                          struct btrfs_block_rsv *block_rsv, int update_ref,
9186                          int for_reloc)
9187 {
9188         struct btrfs_path *path;
9189         struct btrfs_trans_handle *trans;
9190         struct btrfs_root *tree_root = root->fs_info->tree_root;
9191         struct btrfs_root_item *root_item = &root->root_item;
9192         struct walk_control *wc;
9193         struct btrfs_key key;
9194         int err = 0;
9195         int ret;
9196         int level;
9197         bool root_dropped = false;
9198
9199         btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
9200
9201         path = btrfs_alloc_path();
9202         if (!path) {
9203                 err = -ENOMEM;
9204                 goto out;
9205         }
9206
9207         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9208         if (!wc) {
9209                 btrfs_free_path(path);
9210                 err = -ENOMEM;
9211                 goto out;
9212         }
9213
9214         trans = btrfs_start_transaction(tree_root, 0);
9215         if (IS_ERR(trans)) {
9216                 err = PTR_ERR(trans);
9217                 goto out_free;
9218         }
9219
9220         if (block_rsv)
9221                 trans->block_rsv = block_rsv;
9222
9223         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9224                 level = btrfs_header_level(root->node);
9225                 path->nodes[level] = btrfs_lock_root_node(root);
9226                 btrfs_set_lock_blocking(path->nodes[level]);
9227                 path->slots[level] = 0;
9228                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9229                 memset(&wc->update_progress, 0,
9230                        sizeof(wc->update_progress));
9231         } else {
9232                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9233                 memcpy(&wc->update_progress, &key,
9234                        sizeof(wc->update_progress));
9235
9236                 level = root_item->drop_level;
9237                 BUG_ON(level == 0);
9238                 path->lowest_level = level;
9239                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9240                 path->lowest_level = 0;
9241                 if (ret < 0) {
9242                         err = ret;
9243                         goto out_end_trans;
9244                 }
9245                 WARN_ON(ret > 0);
9246
9247                 /*
9248                  * unlock our path, this is safe because only this
9249                  * function is allowed to delete this snapshot
9250                  */
9251                 btrfs_unlock_up_safe(path, 0);
9252
9253                 level = btrfs_header_level(root->node);
9254                 while (1) {
9255                         btrfs_tree_lock(path->nodes[level]);
9256                         btrfs_set_lock_blocking(path->nodes[level]);
9257                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9258
9259                         ret = btrfs_lookup_extent_info(trans, root,
9260                                                 path->nodes[level]->start,
9261                                                 level, 1, &wc->refs[level],
9262                                                 &wc->flags[level]);
9263                         if (ret < 0) {
9264                                 err = ret;
9265                                 goto out_end_trans;
9266                         }
9267                         BUG_ON(wc->refs[level] == 0);
9268
9269                         if (level == root_item->drop_level)
9270                                 break;
9271
9272                         btrfs_tree_unlock(path->nodes[level]);
9273                         path->locks[level] = 0;
9274                         WARN_ON(wc->refs[level] != 1);
9275                         level--;
9276                 }
9277         }
9278
9279         wc->level = level;
9280         wc->shared_level = -1;
9281         wc->stage = DROP_REFERENCE;
9282         wc->update_ref = update_ref;
9283         wc->keep_locks = 0;
9284         wc->for_reloc = for_reloc;
9285         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9286
9287         while (1) {
9288
9289                 ret = walk_down_tree(trans, root, path, wc);
9290                 if (ret < 0) {
9291                         err = ret;
9292                         break;
9293                 }
9294
9295                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9296                 if (ret < 0) {
9297                         err = ret;
9298                         break;
9299                 }
9300
9301                 if (ret > 0) {
9302                         BUG_ON(wc->stage != DROP_REFERENCE);
9303                         break;
9304                 }
9305
9306                 if (wc->stage == DROP_REFERENCE) {
9307                         level = wc->level;
9308                         btrfs_node_key(path->nodes[level],
9309                                        &root_item->drop_progress,
9310                                        path->slots[level]);
9311                         root_item->drop_level = level;
9312                 }
9313
9314                 BUG_ON(wc->level == 0);
9315                 if (btrfs_should_end_transaction(trans, tree_root) ||
9316                     (!for_reloc && btrfs_need_cleaner_sleep(root))) {
9317                         ret = btrfs_update_root(trans, tree_root,
9318                                                 &root->root_key,
9319                                                 root_item);
9320                         if (ret) {
9321                                 btrfs_abort_transaction(trans, ret);
9322                                 err = ret;
9323                                 goto out_end_trans;
9324                         }
9325
9326                         btrfs_end_transaction_throttle(trans, tree_root);
9327                         if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
9328                                 pr_debug("BTRFS: drop snapshot early exit\n");
9329                                 err = -EAGAIN;
9330                                 goto out_free;
9331                         }
9332
9333                         trans = btrfs_start_transaction(tree_root, 0);
9334                         if (IS_ERR(trans)) {
9335                                 err = PTR_ERR(trans);
9336                                 goto out_free;
9337                         }
9338                         if (block_rsv)
9339                                 trans->block_rsv = block_rsv;
9340                 }
9341         }
9342         btrfs_release_path(path);
9343         if (err)
9344                 goto out_end_trans;
9345
9346         ret = btrfs_del_root(trans, tree_root, &root->root_key);
9347         if (ret) {
9348                 btrfs_abort_transaction(trans, ret);
9349                 goto out_end_trans;
9350         }
9351
9352         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9353                 ret = btrfs_find_root(tree_root, &root->root_key, path,
9354                                       NULL, NULL);
9355                 if (ret < 0) {
9356                         btrfs_abort_transaction(trans, ret);
9357                         err = ret;
9358                         goto out_end_trans;
9359                 } else if (ret > 0) {
9360                         /* if we fail to delete the orphan item this time
9361                          * around, it'll get picked up the next time.
9362                          *
9363                          * The most common failure here is just -ENOENT.
9364                          */
9365                         btrfs_del_orphan_item(trans, tree_root,
9366                                               root->root_key.objectid);
9367                 }
9368         }
9369
9370         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9371                 btrfs_add_dropped_root(trans, root);
9372         } else {
9373                 free_extent_buffer(root->node);
9374                 free_extent_buffer(root->commit_root);
9375                 btrfs_put_fs_root(root);
9376         }
9377         root_dropped = true;
9378 out_end_trans:
9379         btrfs_end_transaction_throttle(trans, tree_root);
9380 out_free:
9381         kfree(wc);
9382         btrfs_free_path(path);
9383 out:
9384         /*
9385          * So if we need to stop dropping the snapshot for whatever reason we
9386          * need to make sure to add it back to the dead root list so that we
9387          * keep trying to do the work later.  This also cleans up roots if we
9388          * don't have it in the radix (like when we recover after a power fail
9389          * or unmount) so we don't leak memory.
9390          */
9391         if (!for_reloc && root_dropped == false)
9392                 btrfs_add_dead_root(root);
9393         if (err && err != -EAGAIN)
9394                 btrfs_handle_fs_error(root->fs_info, err, NULL);
9395         return err;
9396 }
9397
9398 /*
9399  * drop subtree rooted at tree block 'node'.
9400  *
9401  * NOTE: this function will unlock and release tree block 'node'
9402  * only used by relocation code
9403  */
9404 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9405                         struct btrfs_root *root,
9406                         struct extent_buffer *node,
9407                         struct extent_buffer *parent)
9408 {
9409         struct btrfs_path *path;
9410         struct walk_control *wc;
9411         int level;
9412         int parent_level;
9413         int ret = 0;
9414         int wret;
9415
9416         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9417
9418         path = btrfs_alloc_path();
9419         if (!path)
9420                 return -ENOMEM;
9421
9422         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9423         if (!wc) {
9424                 btrfs_free_path(path);
9425                 return -ENOMEM;
9426         }
9427
9428         btrfs_assert_tree_locked(parent);
9429         parent_level = btrfs_header_level(parent);
9430         extent_buffer_get(parent);
9431         path->nodes[parent_level] = parent;
9432         path->slots[parent_level] = btrfs_header_nritems(parent);
9433
9434         btrfs_assert_tree_locked(node);
9435         level = btrfs_header_level(node);
9436         path->nodes[level] = node;
9437         path->slots[level] = 0;
9438         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9439
9440         wc->refs[parent_level] = 1;
9441         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9442         wc->level = level;
9443         wc->shared_level = -1;
9444         wc->stage = DROP_REFERENCE;
9445         wc->update_ref = 0;
9446         wc->keep_locks = 1;
9447         wc->for_reloc = 1;
9448         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9449
9450         while (1) {
9451                 wret = walk_down_tree(trans, root, path, wc);
9452                 if (wret < 0) {
9453                         ret = wret;
9454                         break;
9455                 }
9456
9457                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9458                 if (wret < 0)
9459                         ret = wret;
9460                 if (wret != 0)
9461                         break;
9462         }
9463
9464         kfree(wc);
9465         btrfs_free_path(path);
9466         return ret;
9467 }
9468
9469 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
9470 {
9471         u64 num_devices;
9472         u64 stripped;
9473
9474         /*
9475          * if restripe for this chunk_type is on pick target profile and
9476          * return, otherwise do the usual balance
9477          */
9478         stripped = get_restripe_target(root->fs_info, flags);
9479         if (stripped)
9480                 return extended_to_chunk(stripped);
9481
9482         num_devices = root->fs_info->fs_devices->rw_devices;
9483
9484         stripped = BTRFS_BLOCK_GROUP_RAID0 |
9485                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9486                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9487
9488         if (num_devices == 1) {
9489                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9490                 stripped = flags & ~stripped;
9491
9492                 /* turn raid0 into single device chunks */
9493                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9494                         return stripped;
9495
9496                 /* turn mirroring into duplication */
9497                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9498                              BTRFS_BLOCK_GROUP_RAID10))
9499                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9500         } else {
9501                 /* they already had raid on here, just return */
9502                 if (flags & stripped)
9503                         return flags;
9504
9505                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9506                 stripped = flags & ~stripped;
9507
9508                 /* switch duplicated blocks with raid1 */
9509                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9510                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9511
9512                 /* this is drive concat, leave it alone */
9513         }
9514
9515         return flags;
9516 }
9517
9518 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9519 {
9520         struct btrfs_space_info *sinfo = cache->space_info;
9521         u64 num_bytes;
9522         u64 min_allocable_bytes;
9523         int ret = -ENOSPC;
9524
9525         /*
9526          * We need some metadata space and system metadata space for
9527          * allocating chunks in some corner cases until we force to set
9528          * it to be readonly.
9529          */
9530         if ((sinfo->flags &
9531              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9532             !force)
9533                 min_allocable_bytes = SZ_1M;
9534         else
9535                 min_allocable_bytes = 0;
9536
9537         spin_lock(&sinfo->lock);
9538         spin_lock(&cache->lock);
9539
9540         if (cache->ro) {
9541                 cache->ro++;
9542                 ret = 0;
9543                 goto out;
9544         }
9545
9546         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9547                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9548
9549         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
9550             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
9551             min_allocable_bytes <= sinfo->total_bytes) {
9552                 sinfo->bytes_readonly += num_bytes;
9553                 cache->ro++;
9554                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9555                 ret = 0;
9556         }
9557 out:
9558         spin_unlock(&cache->lock);
9559         spin_unlock(&sinfo->lock);
9560         return ret;
9561 }
9562
9563 int btrfs_inc_block_group_ro(struct btrfs_root *root,
9564                              struct btrfs_block_group_cache *cache)
9565
9566 {
9567         struct btrfs_trans_handle *trans;
9568         u64 alloc_flags;
9569         int ret;
9570
9571 again:
9572         trans = btrfs_join_transaction(root);
9573         if (IS_ERR(trans))
9574                 return PTR_ERR(trans);
9575
9576         /*
9577          * we're not allowed to set block groups readonly after the dirty
9578          * block groups cache has started writing.  If it already started,
9579          * back off and let this transaction commit
9580          */
9581         mutex_lock(&root->fs_info->ro_block_group_mutex);
9582         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9583                 u64 transid = trans->transid;
9584
9585                 mutex_unlock(&root->fs_info->ro_block_group_mutex);
9586                 btrfs_end_transaction(trans, root);
9587
9588                 ret = btrfs_wait_for_commit(root, transid);
9589                 if (ret)
9590                         return ret;
9591                 goto again;
9592         }
9593
9594         /*
9595          * if we are changing raid levels, try to allocate a corresponding
9596          * block group with the new raid level.
9597          */
9598         alloc_flags = update_block_group_flags(root, cache->flags);
9599         if (alloc_flags != cache->flags) {
9600                 ret = do_chunk_alloc(trans, root, alloc_flags,
9601                                      CHUNK_ALLOC_FORCE);
9602                 /*
9603                  * ENOSPC is allowed here, we may have enough space
9604                  * already allocated at the new raid level to
9605                  * carry on
9606                  */
9607                 if (ret == -ENOSPC)
9608                         ret = 0;
9609                 if (ret < 0)
9610                         goto out;
9611         }
9612
9613         ret = inc_block_group_ro(cache, 0);
9614         if (!ret)
9615                 goto out;
9616         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
9617         ret = do_chunk_alloc(trans, root, alloc_flags,
9618                              CHUNK_ALLOC_FORCE);
9619         if (ret < 0)
9620                 goto out;
9621         ret = inc_block_group_ro(cache, 0);
9622 out:
9623         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9624                 alloc_flags = update_block_group_flags(root, cache->flags);
9625                 lock_chunks(root->fs_info->chunk_root);
9626                 check_system_chunk(trans, root, alloc_flags);
9627                 unlock_chunks(root->fs_info->chunk_root);
9628         }
9629         mutex_unlock(&root->fs_info->ro_block_group_mutex);
9630
9631         btrfs_end_transaction(trans, root);
9632         return ret;
9633 }
9634
9635 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9636                             struct btrfs_root *root, u64 type)
9637 {
9638         u64 alloc_flags = get_alloc_profile(root, type);
9639         return do_chunk_alloc(trans, root, alloc_flags,
9640                               CHUNK_ALLOC_FORCE);
9641 }
9642
9643 /*
9644  * helper to account the unused space of all the readonly block group in the
9645  * space_info. takes mirrors into account.
9646  */
9647 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9648 {
9649         struct btrfs_block_group_cache *block_group;
9650         u64 free_bytes = 0;
9651         int factor;
9652
9653         /* It's df, we don't care if it's racy */
9654         if (list_empty(&sinfo->ro_bgs))
9655                 return 0;
9656
9657         spin_lock(&sinfo->lock);
9658         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9659                 spin_lock(&block_group->lock);
9660
9661                 if (!block_group->ro) {
9662                         spin_unlock(&block_group->lock);
9663                         continue;
9664                 }
9665
9666                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9667                                           BTRFS_BLOCK_GROUP_RAID10 |
9668                                           BTRFS_BLOCK_GROUP_DUP))
9669                         factor = 2;
9670                 else
9671                         factor = 1;
9672
9673                 free_bytes += (block_group->key.offset -
9674                                btrfs_block_group_used(&block_group->item)) *
9675                                factor;
9676
9677                 spin_unlock(&block_group->lock);
9678         }
9679         spin_unlock(&sinfo->lock);
9680
9681         return free_bytes;
9682 }
9683
9684 void btrfs_dec_block_group_ro(struct btrfs_root *root,
9685                               struct btrfs_block_group_cache *cache)
9686 {
9687         struct btrfs_space_info *sinfo = cache->space_info;
9688         u64 num_bytes;
9689
9690         BUG_ON(!cache->ro);
9691
9692         spin_lock(&sinfo->lock);
9693         spin_lock(&cache->lock);
9694         if (!--cache->ro) {
9695                 num_bytes = cache->key.offset - cache->reserved -
9696                             cache->pinned - cache->bytes_super -
9697                             btrfs_block_group_used(&cache->item);
9698                 sinfo->bytes_readonly -= num_bytes;
9699                 list_del_init(&cache->ro_list);
9700         }
9701         spin_unlock(&cache->lock);
9702         spin_unlock(&sinfo->lock);
9703 }
9704
9705 /*
9706  * checks to see if its even possible to relocate this block group.
9707  *
9708  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9709  * ok to go ahead and try.
9710  */
9711 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
9712 {
9713         struct btrfs_block_group_cache *block_group;
9714         struct btrfs_space_info *space_info;
9715         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
9716         struct btrfs_device *device;
9717         struct btrfs_trans_handle *trans;
9718         u64 min_free;
9719         u64 dev_min = 1;
9720         u64 dev_nr = 0;
9721         u64 target;
9722         int debug;
9723         int index;
9724         int full = 0;
9725         int ret = 0;
9726
9727         debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
9728
9729         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9730
9731         /* odd, couldn't find the block group, leave it alone */
9732         if (!block_group) {
9733                 if (debug)
9734                         btrfs_warn(root->fs_info,
9735                                    "can't find block group for bytenr %llu",
9736                                    bytenr);
9737                 return -1;
9738         }
9739
9740         min_free = btrfs_block_group_used(&block_group->item);
9741
9742         /* no bytes used, we're good */
9743         if (!min_free)
9744                 goto out;
9745
9746         space_info = block_group->space_info;
9747         spin_lock(&space_info->lock);
9748
9749         full = space_info->full;
9750
9751         /*
9752          * if this is the last block group we have in this space, we can't
9753          * relocate it unless we're able to allocate a new chunk below.
9754          *
9755          * Otherwise, we need to make sure we have room in the space to handle
9756          * all of the extents from this block group.  If we can, we're good
9757          */
9758         if ((space_info->total_bytes != block_group->key.offset) &&
9759             (space_info->bytes_used + space_info->bytes_reserved +
9760              space_info->bytes_pinned + space_info->bytes_readonly +
9761              min_free < space_info->total_bytes)) {
9762                 spin_unlock(&space_info->lock);
9763                 goto out;
9764         }
9765         spin_unlock(&space_info->lock);
9766
9767         /*
9768          * ok we don't have enough space, but maybe we have free space on our
9769          * devices to allocate new chunks for relocation, so loop through our
9770          * alloc devices and guess if we have enough space.  if this block
9771          * group is going to be restriped, run checks against the target
9772          * profile instead of the current one.
9773          */
9774         ret = -1;
9775
9776         /*
9777          * index:
9778          *      0: raid10
9779          *      1: raid1
9780          *      2: dup
9781          *      3: raid0
9782          *      4: single
9783          */
9784         target = get_restripe_target(root->fs_info, block_group->flags);
9785         if (target) {
9786                 index = __get_raid_index(extended_to_chunk(target));
9787         } else {
9788                 /*
9789                  * this is just a balance, so if we were marked as full
9790                  * we know there is no space for a new chunk
9791                  */
9792                 if (full) {
9793                         if (debug)
9794                                 btrfs_warn(root->fs_info,
9795                                         "no space to alloc new chunk for block group %llu",
9796                                         block_group->key.objectid);
9797                         goto out;
9798                 }
9799
9800                 index = get_block_group_index(block_group);
9801         }
9802
9803         if (index == BTRFS_RAID_RAID10) {
9804                 dev_min = 4;
9805                 /* Divide by 2 */
9806                 min_free >>= 1;
9807         } else if (index == BTRFS_RAID_RAID1) {
9808                 dev_min = 2;
9809         } else if (index == BTRFS_RAID_DUP) {
9810                 /* Multiply by 2 */
9811                 min_free <<= 1;
9812         } else if (index == BTRFS_RAID_RAID0) {
9813                 dev_min = fs_devices->rw_devices;
9814                 min_free = div64_u64(min_free, dev_min);
9815         }
9816
9817         /* We need to do this so that we can look at pending chunks */
9818         trans = btrfs_join_transaction(root);
9819         if (IS_ERR(trans)) {
9820                 ret = PTR_ERR(trans);
9821                 goto out;
9822         }
9823
9824         mutex_lock(&root->fs_info->chunk_mutex);
9825         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9826                 u64 dev_offset;
9827
9828                 /*
9829                  * check to make sure we can actually find a chunk with enough
9830                  * space to fit our block group in.
9831                  */
9832                 if (device->total_bytes > device->bytes_used + min_free &&
9833                     !device->is_tgtdev_for_dev_replace) {
9834                         ret = find_free_dev_extent(trans, device, min_free,
9835                                                    &dev_offset, NULL);
9836                         if (!ret)
9837                                 dev_nr++;
9838
9839                         if (dev_nr >= dev_min)
9840                                 break;
9841
9842                         ret = -1;
9843                 }
9844         }
9845         if (debug && ret == -1)
9846                 btrfs_warn(root->fs_info,
9847                         "no space to allocate a new chunk for block group %llu",
9848                         block_group->key.objectid);
9849         mutex_unlock(&root->fs_info->chunk_mutex);
9850         btrfs_end_transaction(trans, root);
9851 out:
9852         btrfs_put_block_group(block_group);
9853         return ret;
9854 }
9855
9856 static int find_first_block_group(struct btrfs_root *root,
9857                 struct btrfs_path *path, struct btrfs_key *key)
9858 {
9859         int ret = 0;
9860         struct btrfs_key found_key;
9861         struct extent_buffer *leaf;
9862         int slot;
9863
9864         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9865         if (ret < 0)
9866                 goto out;
9867
9868         while (1) {
9869                 slot = path->slots[0];
9870                 leaf = path->nodes[0];
9871                 if (slot >= btrfs_header_nritems(leaf)) {
9872                         ret = btrfs_next_leaf(root, path);
9873                         if (ret == 0)
9874                                 continue;
9875                         if (ret < 0)
9876                                 goto out;
9877                         break;
9878                 }
9879                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9880
9881                 if (found_key.objectid >= key->objectid &&
9882                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9883                         struct extent_map_tree *em_tree;
9884                         struct extent_map *em;
9885
9886                         em_tree = &root->fs_info->mapping_tree.map_tree;
9887                         read_lock(&em_tree->lock);
9888                         em = lookup_extent_mapping(em_tree, found_key.objectid,
9889                                                    found_key.offset);
9890                         read_unlock(&em_tree->lock);
9891                         if (!em) {
9892                                 btrfs_err(root->fs_info,
9893                         "logical %llu len %llu found bg but no related chunk",
9894                                           found_key.objectid, found_key.offset);
9895                                 ret = -ENOENT;
9896                         } else {
9897                                 ret = 0;
9898                         }
9899                         goto out;
9900                 }
9901                 path->slots[0]++;
9902         }
9903 out:
9904         return ret;
9905 }
9906
9907 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9908 {
9909         struct btrfs_block_group_cache *block_group;
9910         u64 last = 0;
9911
9912         while (1) {
9913                 struct inode *inode;
9914
9915                 block_group = btrfs_lookup_first_block_group(info, last);
9916                 while (block_group) {
9917                         spin_lock(&block_group->lock);
9918                         if (block_group->iref)
9919                                 break;
9920                         spin_unlock(&block_group->lock);
9921                         block_group = next_block_group(info->tree_root,
9922                                                        block_group);
9923                 }
9924                 if (!block_group) {
9925                         if (last == 0)
9926                                 break;
9927                         last = 0;
9928                         continue;
9929                 }
9930
9931                 inode = block_group->inode;
9932                 block_group->iref = 0;
9933                 block_group->inode = NULL;
9934                 spin_unlock(&block_group->lock);
9935                 ASSERT(block_group->io_ctl.inode == NULL);
9936                 iput(inode);
9937                 last = block_group->key.objectid + block_group->key.offset;
9938                 btrfs_put_block_group(block_group);
9939         }
9940 }
9941
9942 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9943 {
9944         struct btrfs_block_group_cache *block_group;
9945         struct btrfs_space_info *space_info;
9946         struct btrfs_caching_control *caching_ctl;
9947         struct rb_node *n;
9948
9949         down_write(&info->commit_root_sem);
9950         while (!list_empty(&info->caching_block_groups)) {
9951                 caching_ctl = list_entry(info->caching_block_groups.next,
9952                                          struct btrfs_caching_control, list);
9953                 list_del(&caching_ctl->list);
9954                 put_caching_control(caching_ctl);
9955         }
9956         up_write(&info->commit_root_sem);
9957
9958         spin_lock(&info->unused_bgs_lock);
9959         while (!list_empty(&info->unused_bgs)) {
9960                 block_group = list_first_entry(&info->unused_bgs,
9961                                                struct btrfs_block_group_cache,
9962                                                bg_list);
9963                 list_del_init(&block_group->bg_list);
9964                 btrfs_put_block_group(block_group);
9965         }
9966         spin_unlock(&info->unused_bgs_lock);
9967
9968         spin_lock(&info->block_group_cache_lock);
9969         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9970                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9971                                        cache_node);
9972                 rb_erase(&block_group->cache_node,
9973                          &info->block_group_cache_tree);
9974                 RB_CLEAR_NODE(&block_group->cache_node);
9975                 spin_unlock(&info->block_group_cache_lock);
9976
9977                 down_write(&block_group->space_info->groups_sem);
9978                 list_del(&block_group->list);
9979                 up_write(&block_group->space_info->groups_sem);
9980
9981                 if (block_group->cached == BTRFS_CACHE_STARTED)
9982                         wait_block_group_cache_done(block_group);
9983
9984                 /*
9985                  * We haven't cached this block group, which means we could
9986                  * possibly have excluded extents on this block group.
9987                  */
9988                 if (block_group->cached == BTRFS_CACHE_NO ||
9989                     block_group->cached == BTRFS_CACHE_ERROR)
9990                         free_excluded_extents(info->extent_root, block_group);
9991
9992                 btrfs_remove_free_space_cache(block_group);
9993                 ASSERT(list_empty(&block_group->dirty_list));
9994                 ASSERT(list_empty(&block_group->io_list));
9995                 ASSERT(list_empty(&block_group->bg_list));
9996                 ASSERT(atomic_read(&block_group->count) == 1);
9997                 btrfs_put_block_group(block_group);
9998
9999                 spin_lock(&info->block_group_cache_lock);
10000         }
10001         spin_unlock(&info->block_group_cache_lock);
10002
10003         /* now that all the block groups are freed, go through and
10004          * free all the space_info structs.  This is only called during
10005          * the final stages of unmount, and so we know nobody is
10006          * using them.  We call synchronize_rcu() once before we start,
10007          * just to be on the safe side.
10008          */
10009         synchronize_rcu();
10010
10011         release_global_block_rsv(info);
10012
10013         while (!list_empty(&info->space_info)) {
10014                 int i;
10015
10016                 space_info = list_entry(info->space_info.next,
10017                                         struct btrfs_space_info,
10018                                         list);
10019
10020                 /*
10021                  * Do not hide this behind enospc_debug, this is actually
10022                  * important and indicates a real bug if this happens.
10023                  */
10024                 if (WARN_ON(space_info->bytes_pinned > 0 ||
10025                             space_info->bytes_reserved > 0 ||
10026                             space_info->bytes_may_use > 0))
10027                         dump_space_info(space_info, 0, 0);
10028                 list_del(&space_info->list);
10029                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10030                         struct kobject *kobj;
10031                         kobj = space_info->block_group_kobjs[i];
10032                         space_info->block_group_kobjs[i] = NULL;
10033                         if (kobj) {
10034                                 kobject_del(kobj);
10035                                 kobject_put(kobj);
10036                         }
10037                 }
10038                 kobject_del(&space_info->kobj);
10039                 kobject_put(&space_info->kobj);
10040         }
10041         return 0;
10042 }
10043
10044 static void __link_block_group(struct btrfs_space_info *space_info,
10045                                struct btrfs_block_group_cache *cache)
10046 {
10047         int index = get_block_group_index(cache);
10048         bool first = false;
10049
10050         down_write(&space_info->groups_sem);
10051         if (list_empty(&space_info->block_groups[index]))
10052                 first = true;
10053         list_add_tail(&cache->list, &space_info->block_groups[index]);
10054         up_write(&space_info->groups_sem);
10055
10056         if (first) {
10057                 struct raid_kobject *rkobj;
10058                 int ret;
10059
10060                 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10061                 if (!rkobj)
10062                         goto out_err;
10063                 rkobj->raid_type = index;
10064                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10065                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10066                                   "%s", get_raid_name(index));
10067                 if (ret) {
10068                         kobject_put(&rkobj->kobj);
10069                         goto out_err;
10070                 }
10071                 space_info->block_group_kobjs[index] = &rkobj->kobj;
10072         }
10073
10074         return;
10075 out_err:
10076         pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
10077 }
10078
10079 static struct btrfs_block_group_cache *
10080 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
10081 {
10082         struct btrfs_block_group_cache *cache;
10083
10084         cache = kzalloc(sizeof(*cache), GFP_NOFS);
10085         if (!cache)
10086                 return NULL;
10087
10088         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10089                                         GFP_NOFS);
10090         if (!cache->free_space_ctl) {
10091                 kfree(cache);
10092                 return NULL;
10093         }
10094
10095         cache->key.objectid = start;
10096         cache->key.offset = size;
10097         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10098
10099         cache->sectorsize = root->sectorsize;
10100         cache->fs_info = root->fs_info;
10101         cache->full_stripe_len = btrfs_full_stripe_len(root,
10102                                                &root->fs_info->mapping_tree,
10103                                                start);
10104         set_free_space_tree_thresholds(cache);
10105
10106         atomic_set(&cache->count, 1);
10107         spin_lock_init(&cache->lock);
10108         init_rwsem(&cache->data_rwsem);
10109         INIT_LIST_HEAD(&cache->list);
10110         INIT_LIST_HEAD(&cache->cluster_list);
10111         INIT_LIST_HEAD(&cache->bg_list);
10112         INIT_LIST_HEAD(&cache->ro_list);
10113         INIT_LIST_HEAD(&cache->dirty_list);
10114         INIT_LIST_HEAD(&cache->io_list);
10115         btrfs_init_free_space_ctl(cache);
10116         atomic_set(&cache->trimming, 0);
10117         mutex_init(&cache->free_space_lock);
10118
10119         return cache;
10120 }
10121
10122 int btrfs_read_block_groups(struct btrfs_root *root)
10123 {
10124         struct btrfs_path *path;
10125         int ret;
10126         struct btrfs_block_group_cache *cache;
10127         struct btrfs_fs_info *info = root->fs_info;
10128         struct btrfs_space_info *space_info;
10129         struct btrfs_key key;
10130         struct btrfs_key found_key;
10131         struct extent_buffer *leaf;
10132         int need_clear = 0;
10133         u64 cache_gen;
10134
10135         root = info->extent_root;
10136         key.objectid = 0;
10137         key.offset = 0;
10138         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10139         path = btrfs_alloc_path();
10140         if (!path)
10141                 return -ENOMEM;
10142         path->reada = READA_FORWARD;
10143
10144         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
10145         if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
10146             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
10147                 need_clear = 1;
10148         if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
10149                 need_clear = 1;
10150
10151         while (1) {
10152                 ret = find_first_block_group(root, path, &key);
10153                 if (ret > 0)
10154                         break;
10155                 if (ret != 0)
10156                         goto error;
10157
10158                 leaf = path->nodes[0];
10159                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10160
10161                 cache = btrfs_create_block_group_cache(root, found_key.objectid,
10162                                                        found_key.offset);
10163                 if (!cache) {
10164                         ret = -ENOMEM;
10165                         goto error;
10166                 }
10167
10168                 if (need_clear) {
10169                         /*
10170                          * When we mount with old space cache, we need to
10171                          * set BTRFS_DC_CLEAR and set dirty flag.
10172                          *
10173                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10174                          *    truncate the old free space cache inode and
10175                          *    setup a new one.
10176                          * b) Setting 'dirty flag' makes sure that we flush
10177                          *    the new space cache info onto disk.
10178                          */
10179                         if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
10180                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
10181                 }
10182
10183                 read_extent_buffer(leaf, &cache->item,
10184                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
10185                                    sizeof(cache->item));
10186                 cache->flags = btrfs_block_group_flags(&cache->item);
10187
10188                 key.objectid = found_key.objectid + found_key.offset;
10189                 btrfs_release_path(path);
10190
10191                 /*
10192                  * We need to exclude the super stripes now so that the space
10193                  * info has super bytes accounted for, otherwise we'll think
10194                  * we have more space than we actually do.
10195                  */
10196                 ret = exclude_super_stripes(root, cache);
10197                 if (ret) {
10198                         /*
10199                          * We may have excluded something, so call this just in
10200                          * case.
10201                          */
10202                         free_excluded_extents(root, cache);
10203                         btrfs_put_block_group(cache);
10204                         goto error;
10205                 }
10206
10207                 /*
10208                  * check for two cases, either we are full, and therefore
10209                  * don't need to bother with the caching work since we won't
10210                  * find any space, or we are empty, and we can just add all
10211                  * the space in and be done with it.  This saves us _alot_ of
10212                  * time, particularly in the full case.
10213                  */
10214                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10215                         cache->last_byte_to_unpin = (u64)-1;
10216                         cache->cached = BTRFS_CACHE_FINISHED;
10217                         free_excluded_extents(root, cache);
10218                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10219                         cache->last_byte_to_unpin = (u64)-1;
10220                         cache->cached = BTRFS_CACHE_FINISHED;
10221                         add_new_free_space(cache, root->fs_info,
10222                                            found_key.objectid,
10223                                            found_key.objectid +
10224                                            found_key.offset);
10225                         free_excluded_extents(root, cache);
10226                 }
10227
10228                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
10229                 if (ret) {
10230                         btrfs_remove_free_space_cache(cache);
10231                         btrfs_put_block_group(cache);
10232                         goto error;
10233                 }
10234
10235                 trace_btrfs_add_block_group(root->fs_info, cache, 0);
10236                 ret = update_space_info(info, cache->flags, found_key.offset,
10237                                         btrfs_block_group_used(&cache->item),
10238                                         cache->bytes_super, &space_info);
10239                 if (ret) {
10240                         btrfs_remove_free_space_cache(cache);
10241                         spin_lock(&info->block_group_cache_lock);
10242                         rb_erase(&cache->cache_node,
10243                                  &info->block_group_cache_tree);
10244                         RB_CLEAR_NODE(&cache->cache_node);
10245                         spin_unlock(&info->block_group_cache_lock);
10246                         btrfs_put_block_group(cache);
10247                         goto error;
10248                 }
10249
10250                 cache->space_info = space_info;
10251
10252                 __link_block_group(space_info, cache);
10253
10254                 set_avail_alloc_bits(root->fs_info, cache->flags);
10255                 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
10256                         inc_block_group_ro(cache, 1);
10257                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10258                         spin_lock(&info->unused_bgs_lock);
10259                         /* Should always be true but just in case. */
10260                         if (list_empty(&cache->bg_list)) {
10261                                 btrfs_get_block_group(cache);
10262                                 list_add_tail(&cache->bg_list,
10263                                               &info->unused_bgs);
10264                         }
10265                         spin_unlock(&info->unused_bgs_lock);
10266                 }
10267         }
10268
10269         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
10270                 if (!(get_alloc_profile(root, space_info->flags) &
10271                       (BTRFS_BLOCK_GROUP_RAID10 |
10272                        BTRFS_BLOCK_GROUP_RAID1 |
10273                        BTRFS_BLOCK_GROUP_RAID5 |
10274                        BTRFS_BLOCK_GROUP_RAID6 |
10275                        BTRFS_BLOCK_GROUP_DUP)))
10276                         continue;
10277                 /*
10278                  * avoid allocating from un-mirrored block group if there are
10279                  * mirrored block groups.
10280                  */
10281                 list_for_each_entry(cache,
10282                                 &space_info->block_groups[BTRFS_RAID_RAID0],
10283                                 list)
10284                         inc_block_group_ro(cache, 1);
10285                 list_for_each_entry(cache,
10286                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
10287                                 list)
10288                         inc_block_group_ro(cache, 1);
10289         }
10290
10291         init_global_block_rsv(info);
10292         ret = 0;
10293 error:
10294         btrfs_free_path(path);
10295         return ret;
10296 }
10297
10298 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
10299                                        struct btrfs_root *root)
10300 {
10301         struct btrfs_block_group_cache *block_group, *tmp;
10302         struct btrfs_root *extent_root = root->fs_info->extent_root;
10303         struct btrfs_block_group_item item;
10304         struct btrfs_key key;
10305         int ret = 0;
10306         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10307
10308         trans->can_flush_pending_bgs = false;
10309         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10310                 if (ret)
10311                         goto next;
10312
10313                 spin_lock(&block_group->lock);
10314                 memcpy(&item, &block_group->item, sizeof(item));
10315                 memcpy(&key, &block_group->key, sizeof(key));
10316                 spin_unlock(&block_group->lock);
10317
10318                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10319                                         sizeof(item));
10320                 if (ret)
10321                         btrfs_abort_transaction(trans, ret);
10322                 ret = btrfs_finish_chunk_alloc(trans, extent_root,
10323                                                key.objectid, key.offset);
10324                 if (ret)
10325                         btrfs_abort_transaction(trans, ret);
10326                 add_block_group_free_space(trans, root->fs_info, block_group);
10327                 /* already aborted the transaction if it failed. */
10328 next:
10329                 list_del_init(&block_group->bg_list);
10330         }
10331         trans->can_flush_pending_bgs = can_flush_pending_bgs;
10332 }
10333
10334 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10335                            struct btrfs_root *root, u64 bytes_used,
10336                            u64 type, u64 chunk_objectid, u64 chunk_offset,
10337                            u64 size)
10338 {
10339         int ret;
10340         struct btrfs_root *extent_root;
10341         struct btrfs_block_group_cache *cache;
10342         extent_root = root->fs_info->extent_root;
10343
10344         btrfs_set_log_full_commit(root->fs_info, trans);
10345
10346         cache = btrfs_create_block_group_cache(root, chunk_offset, size);
10347         if (!cache)
10348                 return -ENOMEM;
10349
10350         btrfs_set_block_group_used(&cache->item, bytes_used);
10351         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
10352         btrfs_set_block_group_flags(&cache->item, type);
10353
10354         cache->flags = type;
10355         cache->last_byte_to_unpin = (u64)-1;
10356         cache->cached = BTRFS_CACHE_FINISHED;
10357         cache->needs_free_space = 1;
10358         ret = exclude_super_stripes(root, cache);
10359         if (ret) {
10360                 /*
10361                  * We may have excluded something, so call this just in
10362                  * case.
10363                  */
10364                 free_excluded_extents(root, cache);
10365                 btrfs_put_block_group(cache);
10366                 return ret;
10367         }
10368
10369         add_new_free_space(cache, root->fs_info, chunk_offset,
10370                            chunk_offset + size);
10371
10372         free_excluded_extents(root, cache);
10373
10374 #ifdef CONFIG_BTRFS_DEBUG
10375         if (btrfs_should_fragment_free_space(root, cache)) {
10376                 u64 new_bytes_used = size - bytes_used;
10377
10378                 bytes_used += new_bytes_used >> 1;
10379                 fragment_free_space(root, cache);
10380         }
10381 #endif
10382         /*
10383          * Call to ensure the corresponding space_info object is created and
10384          * assigned to our block group, but don't update its counters just yet.
10385          * We want our bg to be added to the rbtree with its ->space_info set.
10386          */
10387         ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
10388                                 &cache->space_info);
10389         if (ret) {
10390                 btrfs_remove_free_space_cache(cache);
10391                 btrfs_put_block_group(cache);
10392                 return ret;
10393         }
10394
10395         ret = btrfs_add_block_group_cache(root->fs_info, cache);
10396         if (ret) {
10397                 btrfs_remove_free_space_cache(cache);
10398                 btrfs_put_block_group(cache);
10399                 return ret;
10400         }
10401
10402         /*
10403          * Now that our block group has its ->space_info set and is inserted in
10404          * the rbtree, update the space info's counters.
10405          */
10406         trace_btrfs_add_block_group(root->fs_info, cache, 1);
10407         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
10408                                 cache->bytes_super, &cache->space_info);
10409         if (ret) {
10410                 btrfs_remove_free_space_cache(cache);
10411                 spin_lock(&root->fs_info->block_group_cache_lock);
10412                 rb_erase(&cache->cache_node,
10413                          &root->fs_info->block_group_cache_tree);
10414                 RB_CLEAR_NODE(&cache->cache_node);
10415                 spin_unlock(&root->fs_info->block_group_cache_lock);
10416                 btrfs_put_block_group(cache);
10417                 return ret;
10418         }
10419         update_global_block_rsv(root->fs_info);
10420
10421         __link_block_group(cache->space_info, cache);
10422
10423         list_add_tail(&cache->bg_list, &trans->new_bgs);
10424
10425         set_avail_alloc_bits(extent_root->fs_info, type);
10426         return 0;
10427 }
10428
10429 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10430 {
10431         u64 extra_flags = chunk_to_extended(flags) &
10432                                 BTRFS_EXTENDED_PROFILE_MASK;
10433
10434         write_seqlock(&fs_info->profiles_lock);
10435         if (flags & BTRFS_BLOCK_GROUP_DATA)
10436                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10437         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10438                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10439         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10440                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10441         write_sequnlock(&fs_info->profiles_lock);
10442 }
10443
10444 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10445                              struct btrfs_root *root, u64 group_start,
10446                              struct extent_map *em)
10447 {
10448         struct btrfs_path *path;
10449         struct btrfs_block_group_cache *block_group;
10450         struct btrfs_free_cluster *cluster;
10451         struct btrfs_root *tree_root = root->fs_info->tree_root;
10452         struct btrfs_key key;
10453         struct inode *inode;
10454         struct kobject *kobj = NULL;
10455         int ret;
10456         int index;
10457         int factor;
10458         struct btrfs_caching_control *caching_ctl = NULL;
10459         bool remove_em;
10460
10461         root = root->fs_info->extent_root;
10462
10463         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
10464         BUG_ON(!block_group);
10465         BUG_ON(!block_group->ro);
10466
10467         /*
10468          * Free the reserved super bytes from this block group before
10469          * remove it.
10470          */
10471         free_excluded_extents(root, block_group);
10472
10473         memcpy(&key, &block_group->key, sizeof(key));
10474         index = get_block_group_index(block_group);
10475         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10476                                   BTRFS_BLOCK_GROUP_RAID1 |
10477                                   BTRFS_BLOCK_GROUP_RAID10))
10478                 factor = 2;
10479         else
10480                 factor = 1;
10481
10482         /* make sure this block group isn't part of an allocation cluster */
10483         cluster = &root->fs_info->data_alloc_cluster;
10484         spin_lock(&cluster->refill_lock);
10485         btrfs_return_cluster_to_free_space(block_group, cluster);
10486         spin_unlock(&cluster->refill_lock);
10487
10488         /*
10489          * make sure this block group isn't part of a metadata
10490          * allocation cluster
10491          */
10492         cluster = &root->fs_info->meta_alloc_cluster;
10493         spin_lock(&cluster->refill_lock);
10494         btrfs_return_cluster_to_free_space(block_group, cluster);
10495         spin_unlock(&cluster->refill_lock);
10496
10497         path = btrfs_alloc_path();
10498         if (!path) {
10499                 ret = -ENOMEM;
10500                 goto out;
10501         }
10502
10503         /*
10504          * get the inode first so any iput calls done for the io_list
10505          * aren't the final iput (no unlinks allowed now)
10506          */
10507         inode = lookup_free_space_inode(tree_root, block_group, path);
10508
10509         mutex_lock(&trans->transaction->cache_write_mutex);
10510         /*
10511          * make sure our free spache cache IO is done before remove the
10512          * free space inode
10513          */
10514         spin_lock(&trans->transaction->dirty_bgs_lock);
10515         if (!list_empty(&block_group->io_list)) {
10516                 list_del_init(&block_group->io_list);
10517
10518                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10519
10520                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10521                 btrfs_wait_cache_io(root, trans, block_group,
10522                                     &block_group->io_ctl, path,
10523                                     block_group->key.objectid);
10524                 btrfs_put_block_group(block_group);
10525                 spin_lock(&trans->transaction->dirty_bgs_lock);
10526         }
10527
10528         if (!list_empty(&block_group->dirty_list)) {
10529                 list_del_init(&block_group->dirty_list);
10530                 btrfs_put_block_group(block_group);
10531         }
10532         spin_unlock(&trans->transaction->dirty_bgs_lock);
10533         mutex_unlock(&trans->transaction->cache_write_mutex);
10534
10535         if (!IS_ERR(inode)) {
10536                 ret = btrfs_orphan_add(trans, inode);
10537                 if (ret) {
10538                         btrfs_add_delayed_iput(inode);
10539                         goto out;
10540                 }
10541                 clear_nlink(inode);
10542                 /* One for the block groups ref */
10543                 spin_lock(&block_group->lock);
10544                 if (block_group->iref) {
10545                         block_group->iref = 0;
10546                         block_group->inode = NULL;
10547                         spin_unlock(&block_group->lock);
10548                         iput(inode);
10549                 } else {
10550                         spin_unlock(&block_group->lock);
10551                 }
10552                 /* One for our lookup ref */
10553                 btrfs_add_delayed_iput(inode);
10554         }
10555
10556         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10557         key.offset = block_group->key.objectid;
10558         key.type = 0;
10559
10560         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10561         if (ret < 0)
10562                 goto out;
10563         if (ret > 0)
10564                 btrfs_release_path(path);
10565         if (ret == 0) {
10566                 ret = btrfs_del_item(trans, tree_root, path);
10567                 if (ret)
10568                         goto out;
10569                 btrfs_release_path(path);
10570         }
10571
10572         spin_lock(&root->fs_info->block_group_cache_lock);
10573         rb_erase(&block_group->cache_node,
10574                  &root->fs_info->block_group_cache_tree);
10575         RB_CLEAR_NODE(&block_group->cache_node);
10576
10577         if (root->fs_info->first_logical_byte == block_group->key.objectid)
10578                 root->fs_info->first_logical_byte = (u64)-1;
10579         spin_unlock(&root->fs_info->block_group_cache_lock);
10580
10581         down_write(&block_group->space_info->groups_sem);
10582         /*
10583          * we must use list_del_init so people can check to see if they
10584          * are still on the list after taking the semaphore
10585          */
10586         list_del_init(&block_group->list);
10587         if (list_empty(&block_group->space_info->block_groups[index])) {
10588                 kobj = block_group->space_info->block_group_kobjs[index];
10589                 block_group->space_info->block_group_kobjs[index] = NULL;
10590                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
10591         }
10592         up_write(&block_group->space_info->groups_sem);
10593         if (kobj) {
10594                 kobject_del(kobj);
10595                 kobject_put(kobj);
10596         }
10597
10598         if (block_group->has_caching_ctl)
10599                 caching_ctl = get_caching_control(block_group);
10600         if (block_group->cached == BTRFS_CACHE_STARTED)
10601                 wait_block_group_cache_done(block_group);
10602         if (block_group->has_caching_ctl) {
10603                 down_write(&root->fs_info->commit_root_sem);
10604                 if (!caching_ctl) {
10605                         struct btrfs_caching_control *ctl;
10606
10607                         list_for_each_entry(ctl,
10608                                     &root->fs_info->caching_block_groups, list)
10609                                 if (ctl->block_group == block_group) {
10610                                         caching_ctl = ctl;
10611                                         atomic_inc(&caching_ctl->count);
10612                                         break;
10613                                 }
10614                 }
10615                 if (caching_ctl)
10616                         list_del_init(&caching_ctl->list);
10617                 up_write(&root->fs_info->commit_root_sem);
10618                 if (caching_ctl) {
10619                         /* Once for the caching bgs list and once for us. */
10620                         put_caching_control(caching_ctl);
10621                         put_caching_control(caching_ctl);
10622                 }
10623         }
10624
10625         spin_lock(&trans->transaction->dirty_bgs_lock);
10626         if (!list_empty(&block_group->dirty_list)) {
10627                 WARN_ON(1);
10628         }
10629         if (!list_empty(&block_group->io_list)) {
10630                 WARN_ON(1);
10631         }
10632         spin_unlock(&trans->transaction->dirty_bgs_lock);
10633         btrfs_remove_free_space_cache(block_group);
10634
10635         spin_lock(&block_group->space_info->lock);
10636         list_del_init(&block_group->ro_list);
10637
10638         if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
10639                 WARN_ON(block_group->space_info->total_bytes
10640                         < block_group->key.offset);
10641                 WARN_ON(block_group->space_info->bytes_readonly
10642                         < block_group->key.offset);
10643                 WARN_ON(block_group->space_info->disk_total
10644                         < block_group->key.offset * factor);
10645         }
10646         block_group->space_info->total_bytes -= block_group->key.offset;
10647         block_group->space_info->bytes_readonly -= block_group->key.offset;
10648         block_group->space_info->disk_total -= block_group->key.offset * factor;
10649
10650         spin_unlock(&block_group->space_info->lock);
10651
10652         memcpy(&key, &block_group->key, sizeof(key));
10653
10654         lock_chunks(root);
10655         if (!list_empty(&em->list)) {
10656                 /* We're in the transaction->pending_chunks list. */
10657                 free_extent_map(em);
10658         }
10659         spin_lock(&block_group->lock);
10660         block_group->removed = 1;
10661         /*
10662          * At this point trimming can't start on this block group, because we
10663          * removed the block group from the tree fs_info->block_group_cache_tree
10664          * so no one can't find it anymore and even if someone already got this
10665          * block group before we removed it from the rbtree, they have already
10666          * incremented block_group->trimming - if they didn't, they won't find
10667          * any free space entries because we already removed them all when we
10668          * called btrfs_remove_free_space_cache().
10669          *
10670          * And we must not remove the extent map from the fs_info->mapping_tree
10671          * to prevent the same logical address range and physical device space
10672          * ranges from being reused for a new block group. This is because our
10673          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10674          * completely transactionless, so while it is trimming a range the
10675          * currently running transaction might finish and a new one start,
10676          * allowing for new block groups to be created that can reuse the same
10677          * physical device locations unless we take this special care.
10678          *
10679          * There may also be an implicit trim operation if the file system
10680          * is mounted with -odiscard. The same protections must remain
10681          * in place until the extents have been discarded completely when
10682          * the transaction commit has completed.
10683          */
10684         remove_em = (atomic_read(&block_group->trimming) == 0);
10685         /*
10686          * Make sure a trimmer task always sees the em in the pinned_chunks list
10687          * if it sees block_group->removed == 1 (needs to lock block_group->lock
10688          * before checking block_group->removed).
10689          */
10690         if (!remove_em) {
10691                 /*
10692                  * Our em might be in trans->transaction->pending_chunks which
10693                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10694                  * and so is the fs_info->pinned_chunks list.
10695                  *
10696                  * So at this point we must be holding the chunk_mutex to avoid
10697                  * any races with chunk allocation (more specifically at
10698                  * volumes.c:contains_pending_extent()), to ensure it always
10699                  * sees the em, either in the pending_chunks list or in the
10700                  * pinned_chunks list.
10701                  */
10702                 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
10703         }
10704         spin_unlock(&block_group->lock);
10705
10706         if (remove_em) {
10707                 struct extent_map_tree *em_tree;
10708
10709                 em_tree = &root->fs_info->mapping_tree.map_tree;
10710                 write_lock(&em_tree->lock);
10711                 /*
10712                  * The em might be in the pending_chunks list, so make sure the
10713                  * chunk mutex is locked, since remove_extent_mapping() will
10714                  * delete us from that list.
10715                  */
10716                 remove_extent_mapping(em_tree, em);
10717                 write_unlock(&em_tree->lock);
10718                 /* once for the tree */
10719                 free_extent_map(em);
10720         }
10721
10722         unlock_chunks(root);
10723
10724         ret = remove_block_group_free_space(trans, root->fs_info, block_group);
10725         if (ret)
10726                 goto out;
10727
10728         btrfs_put_block_group(block_group);
10729         btrfs_put_block_group(block_group);
10730
10731         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10732         if (ret > 0)
10733                 ret = -EIO;
10734         if (ret < 0)
10735                 goto out;
10736
10737         ret = btrfs_del_item(trans, root, path);
10738 out:
10739         btrfs_free_path(path);
10740         return ret;
10741 }
10742
10743 struct btrfs_trans_handle *
10744 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10745                                      const u64 chunk_offset)
10746 {
10747         struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10748         struct extent_map *em;
10749         struct map_lookup *map;
10750         unsigned int num_items;
10751
10752         read_lock(&em_tree->lock);
10753         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10754         read_unlock(&em_tree->lock);
10755         ASSERT(em && em->start == chunk_offset);
10756
10757         /*
10758          * We need to reserve 3 + N units from the metadata space info in order
10759          * to remove a block group (done at btrfs_remove_chunk() and at
10760          * btrfs_remove_block_group()), which are used for:
10761          *
10762          * 1 unit for adding the free space inode's orphan (located in the tree
10763          * of tree roots).
10764          * 1 unit for deleting the block group item (located in the extent
10765          * tree).
10766          * 1 unit for deleting the free space item (located in tree of tree
10767          * roots).
10768          * N units for deleting N device extent items corresponding to each
10769          * stripe (located in the device tree).
10770          *
10771          * In order to remove a block group we also need to reserve units in the
10772          * system space info in order to update the chunk tree (update one or
10773          * more device items and remove one chunk item), but this is done at
10774          * btrfs_remove_chunk() through a call to check_system_chunk().
10775          */
10776         map = em->map_lookup;
10777         num_items = 3 + map->num_stripes;
10778         free_extent_map(em);
10779
10780         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10781                                                            num_items, 1);
10782 }
10783
10784 /*
10785  * Process the unused_bgs list and remove any that don't have any allocated
10786  * space inside of them.
10787  */
10788 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10789 {
10790         struct btrfs_block_group_cache *block_group;
10791         struct btrfs_space_info *space_info;
10792         struct btrfs_root *root = fs_info->extent_root;
10793         struct btrfs_trans_handle *trans;
10794         int ret = 0;
10795
10796         if (!fs_info->open)
10797                 return;
10798
10799         spin_lock(&fs_info->unused_bgs_lock);
10800         while (!list_empty(&fs_info->unused_bgs)) {
10801                 u64 start, end;
10802                 int trimming;
10803
10804                 block_group = list_first_entry(&fs_info->unused_bgs,
10805                                                struct btrfs_block_group_cache,
10806                                                bg_list);
10807                 list_del_init(&block_group->bg_list);
10808
10809                 space_info = block_group->space_info;
10810
10811                 if (ret || btrfs_mixed_space_info(space_info)) {
10812                         btrfs_put_block_group(block_group);
10813                         continue;
10814                 }
10815                 spin_unlock(&fs_info->unused_bgs_lock);
10816
10817                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10818
10819                 /* Don't want to race with allocators so take the groups_sem */
10820                 down_write(&space_info->groups_sem);
10821                 spin_lock(&block_group->lock);
10822                 if (block_group->reserved ||
10823                     btrfs_block_group_used(&block_group->item) ||
10824                     block_group->ro ||
10825                     list_is_singular(&block_group->list)) {
10826                         /*
10827                          * We want to bail if we made new allocations or have
10828                          * outstanding allocations in this block group.  We do
10829                          * the ro check in case balance is currently acting on
10830                          * this block group.
10831                          */
10832                         spin_unlock(&block_group->lock);
10833                         up_write(&space_info->groups_sem);
10834                         goto next;
10835                 }
10836                 spin_unlock(&block_group->lock);
10837
10838                 /* We don't want to force the issue, only flip if it's ok. */
10839                 ret = inc_block_group_ro(block_group, 0);
10840                 up_write(&space_info->groups_sem);
10841                 if (ret < 0) {
10842                         ret = 0;
10843                         goto next;
10844                 }
10845
10846                 /*
10847                  * Want to do this before we do anything else so we can recover
10848                  * properly if we fail to join the transaction.
10849                  */
10850                 trans = btrfs_start_trans_remove_block_group(fs_info,
10851                                                      block_group->key.objectid);
10852                 if (IS_ERR(trans)) {
10853                         btrfs_dec_block_group_ro(root, block_group);
10854                         ret = PTR_ERR(trans);
10855                         goto next;
10856                 }
10857
10858                 /*
10859                  * We could have pending pinned extents for this block group,
10860                  * just delete them, we don't care about them anymore.
10861                  */
10862                 start = block_group->key.objectid;
10863                 end = start + block_group->key.offset - 1;
10864                 /*
10865                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10866                  * btrfs_finish_extent_commit(). If we are at transaction N,
10867                  * another task might be running finish_extent_commit() for the
10868                  * previous transaction N - 1, and have seen a range belonging
10869                  * to the block group in freed_extents[] before we were able to
10870                  * clear the whole block group range from freed_extents[]. This
10871                  * means that task can lookup for the block group after we
10872                  * unpinned it from freed_extents[] and removed it, leading to
10873                  * a BUG_ON() at btrfs_unpin_extent_range().
10874                  */
10875                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10876                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10877                                   EXTENT_DIRTY);
10878                 if (ret) {
10879                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10880                         btrfs_dec_block_group_ro(root, block_group);
10881                         goto end_trans;
10882                 }
10883                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10884                                   EXTENT_DIRTY);
10885                 if (ret) {
10886                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10887                         btrfs_dec_block_group_ro(root, block_group);
10888                         goto end_trans;
10889                 }
10890                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10891
10892                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10893                 spin_lock(&space_info->lock);
10894                 spin_lock(&block_group->lock);
10895
10896                 space_info->bytes_pinned -= block_group->pinned;
10897                 space_info->bytes_readonly += block_group->pinned;
10898                 percpu_counter_add(&space_info->total_bytes_pinned,
10899                                    -block_group->pinned);
10900                 block_group->pinned = 0;
10901
10902                 spin_unlock(&block_group->lock);
10903                 spin_unlock(&space_info->lock);
10904
10905                 /* DISCARD can flip during remount */
10906                 trimming = btrfs_test_opt(root->fs_info, DISCARD);
10907
10908                 /* Implicit trim during transaction commit. */
10909                 if (trimming)
10910                         btrfs_get_block_group_trimming(block_group);
10911
10912                 /*
10913                  * Btrfs_remove_chunk will abort the transaction if things go
10914                  * horribly wrong.
10915                  */
10916                 ret = btrfs_remove_chunk(trans, root,
10917                                          block_group->key.objectid);
10918
10919                 if (ret) {
10920                         if (trimming)
10921                                 btrfs_put_block_group_trimming(block_group);
10922                         goto end_trans;
10923                 }
10924
10925                 /*
10926                  * If we're not mounted with -odiscard, we can just forget
10927                  * about this block group. Otherwise we'll need to wait
10928                  * until transaction commit to do the actual discard.
10929                  */
10930                 if (trimming) {
10931                         spin_lock(&fs_info->unused_bgs_lock);
10932                         /*
10933                          * A concurrent scrub might have added us to the list
10934                          * fs_info->unused_bgs, so use a list_move operation
10935                          * to add the block group to the deleted_bgs list.
10936                          */
10937                         list_move(&block_group->bg_list,
10938                                   &trans->transaction->deleted_bgs);
10939                         spin_unlock(&fs_info->unused_bgs_lock);
10940                         btrfs_get_block_group(block_group);
10941                 }
10942 end_trans:
10943                 btrfs_end_transaction(trans, root);
10944 next:
10945                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10946                 btrfs_put_block_group(block_group);
10947                 spin_lock(&fs_info->unused_bgs_lock);
10948         }
10949         spin_unlock(&fs_info->unused_bgs_lock);
10950 }
10951
10952 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10953 {
10954         struct btrfs_space_info *space_info;
10955         struct btrfs_super_block *disk_super;
10956         u64 features;
10957         u64 flags;
10958         int mixed = 0;
10959         int ret;
10960
10961         disk_super = fs_info->super_copy;
10962         if (!btrfs_super_root(disk_super))
10963                 return -EINVAL;
10964
10965         features = btrfs_super_incompat_flags(disk_super);
10966         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10967                 mixed = 1;
10968
10969         flags = BTRFS_BLOCK_GROUP_SYSTEM;
10970         ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10971         if (ret)
10972                 goto out;
10973
10974         if (mixed) {
10975                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10976                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10977         } else {
10978                 flags = BTRFS_BLOCK_GROUP_METADATA;
10979                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10980                 if (ret)
10981                         goto out;
10982
10983                 flags = BTRFS_BLOCK_GROUP_DATA;
10984                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10985         }
10986 out:
10987         return ret;
10988 }
10989
10990 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
10991 {
10992         return unpin_extent_range(root, start, end, false);
10993 }
10994
10995 /*
10996  * It used to be that old block groups would be left around forever.
10997  * Iterating over them would be enough to trim unused space.  Since we
10998  * now automatically remove them, we also need to iterate over unallocated
10999  * space.
11000  *
11001  * We don't want a transaction for this since the discard may take a
11002  * substantial amount of time.  We don't require that a transaction be
11003  * running, but we do need to take a running transaction into account
11004  * to ensure that we're not discarding chunks that were released in
11005  * the current transaction.
11006  *
11007  * Holding the chunks lock will prevent other threads from allocating
11008  * or releasing chunks, but it won't prevent a running transaction
11009  * from committing and releasing the memory that the pending chunks
11010  * list head uses.  For that, we need to take a reference to the
11011  * transaction.
11012  */
11013 static int btrfs_trim_free_extents(struct btrfs_device *device,
11014                                    u64 minlen, u64 *trimmed)
11015 {
11016         u64 start = 0, len = 0;
11017         int ret;
11018
11019         *trimmed = 0;
11020
11021         /* Not writeable = nothing to do. */
11022         if (!device->writeable)
11023                 return 0;
11024
11025         /* No free space = nothing to do. */
11026         if (device->total_bytes <= device->bytes_used)
11027                 return 0;
11028
11029         ret = 0;
11030
11031         while (1) {
11032                 struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
11033                 struct btrfs_transaction *trans;
11034                 u64 bytes;
11035
11036                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11037                 if (ret)
11038                         return ret;
11039
11040                 down_read(&fs_info->commit_root_sem);
11041
11042                 spin_lock(&fs_info->trans_lock);
11043                 trans = fs_info->running_transaction;
11044                 if (trans)
11045                         atomic_inc(&trans->use_count);
11046                 spin_unlock(&fs_info->trans_lock);
11047
11048                 ret = find_free_dev_extent_start(trans, device, minlen, start,
11049                                                  &start, &len);
11050                 if (trans)
11051                         btrfs_put_transaction(trans);
11052
11053                 if (ret) {
11054                         up_read(&fs_info->commit_root_sem);
11055                         mutex_unlock(&fs_info->chunk_mutex);
11056                         if (ret == -ENOSPC)
11057                                 ret = 0;
11058                         break;
11059                 }
11060
11061                 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11062                 up_read(&fs_info->commit_root_sem);
11063                 mutex_unlock(&fs_info->chunk_mutex);
11064
11065                 if (ret)
11066                         break;
11067
11068                 start += len;
11069                 *trimmed += bytes;
11070
11071                 if (fatal_signal_pending(current)) {
11072                         ret = -ERESTARTSYS;
11073                         break;
11074                 }
11075
11076                 cond_resched();
11077         }
11078
11079         return ret;
11080 }
11081
11082 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
11083 {
11084         struct btrfs_fs_info *fs_info = root->fs_info;
11085         struct btrfs_block_group_cache *cache = NULL;
11086         struct btrfs_device *device;
11087         struct list_head *devices;
11088         u64 group_trimmed;
11089         u64 start;
11090         u64 end;
11091         u64 trimmed = 0;
11092         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
11093         int ret = 0;
11094
11095         /*
11096          * try to trim all FS space, our block group may start from non-zero.
11097          */
11098         if (range->len == total_bytes)
11099                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
11100         else
11101                 cache = btrfs_lookup_block_group(fs_info, range->start);
11102
11103         while (cache) {
11104                 if (cache->key.objectid >= (range->start + range->len)) {
11105                         btrfs_put_block_group(cache);
11106                         break;
11107                 }
11108
11109                 start = max(range->start, cache->key.objectid);
11110                 end = min(range->start + range->len,
11111                                 cache->key.objectid + cache->key.offset);
11112
11113                 if (end - start >= range->minlen) {
11114                         if (!block_group_cache_done(cache)) {
11115                                 ret = cache_block_group(cache, 0);
11116                                 if (ret) {
11117                                         btrfs_put_block_group(cache);
11118                                         break;
11119                                 }
11120                                 ret = wait_block_group_cache_done(cache);
11121                                 if (ret) {
11122                                         btrfs_put_block_group(cache);
11123                                         break;
11124                                 }
11125                         }
11126                         ret = btrfs_trim_block_group(cache,
11127                                                      &group_trimmed,
11128                                                      start,
11129                                                      end,
11130                                                      range->minlen);
11131
11132                         trimmed += group_trimmed;
11133                         if (ret) {
11134                                 btrfs_put_block_group(cache);
11135                                 break;
11136                         }
11137                 }
11138
11139                 cache = next_block_group(fs_info->tree_root, cache);
11140         }
11141
11142         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
11143         devices = &root->fs_info->fs_devices->alloc_list;
11144         list_for_each_entry(device, devices, dev_alloc_list) {
11145                 ret = btrfs_trim_free_extents(device, range->minlen,
11146                                               &group_trimmed);
11147                 if (ret)
11148                         break;
11149
11150                 trimmed += group_trimmed;
11151         }
11152         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
11153
11154         range->len = trimmed;
11155         return ret;
11156 }
11157
11158 /*
11159  * btrfs_{start,end}_write_no_snapshoting() are similar to
11160  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11161  * data into the page cache through nocow before the subvolume is snapshoted,
11162  * but flush the data into disk after the snapshot creation, or to prevent
11163  * operations while snapshoting is ongoing and that cause the snapshot to be
11164  * inconsistent (writes followed by expanding truncates for example).
11165  */
11166 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
11167 {
11168         percpu_counter_dec(&root->subv_writers->counter);
11169         /*
11170          * Make sure counter is updated before we wake up waiters.
11171          */
11172         smp_mb();
11173         if (waitqueue_active(&root->subv_writers->wait))
11174                 wake_up(&root->subv_writers->wait);
11175 }
11176
11177 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
11178 {
11179         if (atomic_read(&root->will_be_snapshoted))
11180                 return 0;
11181
11182         percpu_counter_inc(&root->subv_writers->counter);
11183         /*
11184          * Make sure counter is updated before we check for snapshot creation.
11185          */
11186         smp_mb();
11187         if (atomic_read(&root->will_be_snapshoted)) {
11188                 btrfs_end_write_no_snapshoting(root);
11189                 return 0;
11190         }
11191         return 1;
11192 }
11193
11194 static int wait_snapshoting_atomic_t(atomic_t *a)
11195 {
11196         schedule();
11197         return 0;
11198 }
11199
11200 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11201 {
11202         while (true) {
11203                 int ret;
11204
11205                 ret = btrfs_start_write_no_snapshoting(root);
11206                 if (ret)
11207                         break;
11208                 wait_on_atomic_t(&root->will_be_snapshoted,
11209                                  wait_snapshoting_atomic_t,
11210                                  TASK_UNINTERRUPTIBLE);
11211         }
11212 }