btrfs: skip superblocks during discard
[cascardo/linux.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include <linux/percpu_counter.h>
28 #include "hash.h"
29 #include "tree-log.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "volumes.h"
33 #include "raid56.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 #include "math.h"
37 #include "sysfs.h"
38 #include "qgroup.h"
39
40 #undef SCRAMBLE_DELAYED_REFS
41
/*
 * Values accepted by the 'force' argument of do_chunk_alloc():
 *
 * CHUNK_ALLOC_NO_FORCE - allocate a chunk only if one is really needed.
 *
 * CHUNK_ALLOC_LIMITED  - try to allocate a chunk only while very few chunks
 *   have been allocated so far.  The clustering code relies on this to keep
 *   a good pool of storage to cluster in without filling the FS with empty
 *   chunks.
 *
 * CHUNK_ALLOC_FORCE    - an allocation must be attempted.
 */
enum {
        CHUNK_ALLOC_NO_FORCE,           /* 0 */
        CHUNK_ALLOC_LIMITED,            /* 1 */
        CHUNK_ALLOC_FORCE,              /* 2 */
};
61
/*
 * Modes describing how a reservation is being handled:
 *
 * RESERVE_FREE             - a reservation is being released.
 * RESERVE_ALLOC            - space is being allocated and bytes_may_use has
 *   to be updated for ENOSPC accounting.
 * RESERVE_ALLOC_NO_ACCOUNT - space is being allocated but bytes_may_use must
 *   be left alone because the ENOSPC accounting happens elsewhere.
 */
enum {
        RESERVE_FREE,                   /* 0 */
        RESERVE_ALLOC,                  /* 1 */
        RESERVE_ALLOC_NO_ACCOUNT,       /* 2 */
};
76
/*
 * Forward declarations.  The static prototypes let code earlier in the file
 * call these helpers before their definitions appear (presumably further
 * down in this file - the definitions are not visible from here).
 */
77 static int update_block_group(struct btrfs_trans_handle *trans,
78                               struct btrfs_root *root, u64 bytenr,
79                               u64 num_bytes, int alloc);
80 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
81                                 struct btrfs_root *root,
82                                 struct btrfs_delayed_ref_node *node, u64 parent,
83                                 u64 root_objectid, u64 owner_objectid,
84                                 u64 owner_offset, int refs_to_drop,
85                                 struct btrfs_delayed_extent_op *extra_op);
86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
87                                     struct extent_buffer *leaf,
88                                     struct btrfs_extent_item *ei);
89 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
90                                       struct btrfs_root *root,
91                                       u64 parent, u64 root_objectid,
92                                       u64 flags, u64 owner, u64 offset,
93                                       struct btrfs_key *ins, int ref_mod);
94 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
95                                      struct btrfs_root *root,
96                                      u64 parent, u64 root_objectid,
97                                      u64 flags, struct btrfs_disk_key *key,
98                                      int level, struct btrfs_key *ins,
99                                      int no_quota);
/* 'force' takes one of the CHUNK_ALLOC_* values defined above. */
100 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
101                           struct btrfs_root *extent_root, u64 flags,
102                           int force);
103 static int find_next_key(struct btrfs_path *path, int level,
104                          struct btrfs_key *key);
105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
106                             int dump_block_groups);
/* 'reserve' presumably takes one of the RESERVE_* values above - confirm. */
107 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
108                                        u64 num_bytes, int reserve,
109                                        int delalloc);
110 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
111                                u64 num_bytes);
/* Not static: this one has external linkage. */
112 int btrfs_pin_extent(struct btrfs_root *root,
113                      u64 bytenr, u64 num_bytes, int reserved);
114
115 static noinline int
116 block_group_cache_done(struct btrfs_block_group_cache *cache)
117 {
118         smp_mb();
119         return cache->cached == BTRFS_CACHE_FINISHED ||
120                 cache->cached == BTRFS_CACHE_ERROR;
121 }
122
123 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
124 {
125         return (cache->flags & bits) == bits;
126 }
127
/* Take one more reference on @cache by atomically incrementing cache->count. */
128 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
129 {
130         atomic_inc(&cache->count);
131 }
132
133 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
134 {
135         if (atomic_dec_and_test(&cache->count)) {
136                 WARN_ON(cache->pinned > 0);
137                 WARN_ON(cache->reserved > 0);
138                 kfree(cache->free_space_ctl);
139                 kfree(cache);
140         }
141 }
142
143 /*
144  * this adds the block group to the fs_info rb tree for the block group
145  * cache
146  */
147 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
148                                 struct btrfs_block_group_cache *block_group)
149 {
150         struct rb_node **p;
151         struct rb_node *parent = NULL;
152         struct btrfs_block_group_cache *cache;
153
154         spin_lock(&info->block_group_cache_lock);
155         p = &info->block_group_cache_tree.rb_node;
156
157         while (*p) {
158                 parent = *p;
159                 cache = rb_entry(parent, struct btrfs_block_group_cache,
160                                  cache_node);
161                 if (block_group->key.objectid < cache->key.objectid) {
162                         p = &(*p)->rb_left;
163                 } else if (block_group->key.objectid > cache->key.objectid) {
164                         p = &(*p)->rb_right;
165                 } else {
166                         spin_unlock(&info->block_group_cache_lock);
167                         return -EEXIST;
168                 }
169         }
170
171         rb_link_node(&block_group->cache_node, parent, p);
172         rb_insert_color(&block_group->cache_node,
173                         &info->block_group_cache_tree);
174
175         if (info->first_logical_byte > block_group->key.objectid)
176                 info->first_logical_byte = block_group->key.objectid;
177
178         spin_unlock(&info->block_group_cache_lock);
179
180         return 0;
181 }
182
183 /*
184  * This will return the block group at or after bytenr if contains is 0, else
185  * it will return the block group that contains the bytenr
186  */
187 static struct btrfs_block_group_cache *
188 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
189                               int contains)
190 {
191         struct btrfs_block_group_cache *cache, *ret = NULL;
192         struct rb_node *n;
193         u64 end, start;
194
195         spin_lock(&info->block_group_cache_lock);
196         n = info->block_group_cache_tree.rb_node;
197
198         while (n) {
199                 cache = rb_entry(n, struct btrfs_block_group_cache,
200                                  cache_node);
201                 end = cache->key.objectid + cache->key.offset - 1;
202                 start = cache->key.objectid;
203
204                 if (bytenr < start) {
205                         if (!contains && (!ret || start < ret->key.objectid))
206                                 ret = cache;
207                         n = n->rb_left;
208                 } else if (bytenr > start) {
209                         if (contains && bytenr <= end) {
210                                 ret = cache;
211                                 break;
212                         }
213                         n = n->rb_right;
214                 } else {
215                         ret = cache;
216                         break;
217                 }
218         }
219         if (ret) {
220                 btrfs_get_block_group(ret);
221                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
222                         info->first_logical_byte = ret->key.objectid;
223         }
224         spin_unlock(&info->block_group_cache_lock);
225
226         return ret;
227 }
228
229 static int add_excluded_extent(struct btrfs_root *root,
230                                u64 start, u64 num_bytes)
231 {
232         u64 end = start + num_bytes - 1;
233         set_extent_bits(&root->fs_info->freed_extents[0],
234                         start, end, EXTENT_UPTODATE, GFP_NOFS);
235         set_extent_bits(&root->fs_info->freed_extents[1],
236                         start, end, EXTENT_UPTODATE, GFP_NOFS);
237         return 0;
238 }
239
240 static void free_excluded_extents(struct btrfs_root *root,
241                                   struct btrfs_block_group_cache *cache)
242 {
243         u64 start, end;
244
245         start = cache->key.objectid;
246         end = start + cache->key.offset - 1;
247
248         clear_extent_bits(&root->fs_info->freed_extents[0],
249                           start, end, EXTENT_UPTODATE, GFP_NOFS);
250         clear_extent_bits(&root->fs_info->freed_extents[1],
251                           start, end, EXTENT_UPTODATE, GFP_NOFS);
252 }
253
254 static int exclude_super_stripes(struct btrfs_root *root,
255                                  struct btrfs_block_group_cache *cache)
256 {
257         u64 bytenr;
258         u64 *logical;
259         int stripe_len;
260         int i, nr, ret;
261
262         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
263                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
264                 cache->bytes_super += stripe_len;
265                 ret = add_excluded_extent(root, cache->key.objectid,
266                                           stripe_len);
267                 if (ret)
268                         return ret;
269         }
270
271         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
272                 bytenr = btrfs_sb_offset(i);
273                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
274                                        cache->key.objectid, bytenr,
275                                        0, &logical, &nr, &stripe_len);
276                 if (ret)
277                         return ret;
278
279                 while (nr--) {
280                         u64 start, len;
281
282                         if (logical[nr] > cache->key.objectid +
283                             cache->key.offset)
284                                 continue;
285
286                         if (logical[nr] + stripe_len <= cache->key.objectid)
287                                 continue;
288
289                         start = logical[nr];
290                         if (start < cache->key.objectid) {
291                                 start = cache->key.objectid;
292                                 len = (logical[nr] + stripe_len) - start;
293                         } else {
294                                 len = min_t(u64, stripe_len,
295                                             cache->key.objectid +
296                                             cache->key.offset - start);
297                         }
298
299                         cache->bytes_super += len;
300                         ret = add_excluded_extent(root, start, len);
301                         if (ret) {
302                                 kfree(logical);
303                                 return ret;
304                         }
305                 }
306
307                 kfree(logical);
308         }
309         return 0;
310 }
311
312 static struct btrfs_caching_control *
313 get_caching_control(struct btrfs_block_group_cache *cache)
314 {
315         struct btrfs_caching_control *ctl;
316
317         spin_lock(&cache->lock);
318         if (!cache->caching_ctl) {
319                 spin_unlock(&cache->lock);
320                 return NULL;
321         }
322
323         ctl = cache->caching_ctl;
324         atomic_inc(&ctl->count);
325         spin_unlock(&cache->lock);
326         return ctl;
327 }
328
329 static void put_caching_control(struct btrfs_caching_control *ctl)
330 {
331         if (atomic_dec_and_test(&ctl->count))
332                 kfree(ctl);
333 }
334
335 /*
336  * this is only called by cache_block_group, since we could have freed extents
337  * we need to check the pinned_extents for any extents that can't be used yet
338  * since their free space will be released as soon as the transaction commits.
339  */
340 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
341                               struct btrfs_fs_info *info, u64 start, u64 end)
342 {
343         u64 extent_start, extent_end, size, total_added = 0;
344         int ret;
345
346         while (start < end) {
347                 ret = find_first_extent_bit(info->pinned_extents, start,
348                                             &extent_start, &extent_end,
349                                             EXTENT_DIRTY | EXTENT_UPTODATE,
350                                             NULL);
351                 if (ret)
352                         break;
353
354                 if (extent_start <= start) {
355                         start = extent_end + 1;
356                 } else if (extent_start > start && extent_start < end) {
357                         size = extent_start - start;
358                         total_added += size;
359                         ret = btrfs_add_free_space(block_group, start,
360                                                    size);
361                         BUG_ON(ret); /* -ENOMEM or logic error */
362                         start = extent_end + 1;
363                 } else {
364                         break;
365                 }
366         }
367
368         if (start < end) {
369                 size = end - start;
370                 total_added += size;
371                 ret = btrfs_add_free_space(block_group, start, size);
372                 BUG_ON(ret); /* -ENOMEM or logic error */
373         }
374
375         return total_added;
376 }
377
/*
 * Work function that builds the free space information of one block group by
 * walking the extent tree.
 *
 * @work: embedded in a struct btrfs_caching_control; that control structure
 *        names the block group to scan.
 *
 * The commit root of the extent root is searched without tree locking while
 * commit_root_sem is held for read.  Each gap between consecutive
 * EXTENT_ITEM / METADATA_ITEM keys inside the block group is handed to
 * add_new_free_space().  On success the block group is marked
 * BTRFS_CACHE_FINISHED; if the path cannot be allocated or a tree search
 * fails it is marked BTRFS_CACHE_ERROR.  In both cases waiters on
 * caching_ctl->wait are woken and one reference each on the caching control
 * and on the block group is dropped.
 */
378 static noinline void caching_thread(struct btrfs_work *work)
379 {
380         struct btrfs_block_group_cache *block_group;
381         struct btrfs_fs_info *fs_info;
382         struct btrfs_caching_control *caching_ctl;
383         struct btrfs_root *extent_root;
384         struct btrfs_path *path;
385         struct extent_buffer *leaf;
386         struct btrfs_key key;
387         u64 total_found = 0;
388         u64 last = 0;
389         u32 nritems;
        /* Preset so a failed path allocation is handled as an error at 'out'. */
390         int ret = -ENOMEM;
391
392         caching_ctl = container_of(work, struct btrfs_caching_control, work);
393         block_group = caching_ctl->block_group;
394         fs_info = block_group->fs_info;
395         extent_root = fs_info->extent_root;
396
397         path = btrfs_alloc_path();
398         if (!path)
399                 goto out;
400
        /* Start at the block group start, but never below BTRFS_SUPER_INFO_OFFSET. */
401         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
402
403         /*
404          * We don't want to deadlock with somebody trying to allocate a new
405          * extent for the extent root while also trying to search the extent
406          * root to add free space.  So we skip locking and search the commit
407          * root, since it's read-only
408          */
409         path->skip_locking = 1;
410         path->search_commit_root = 1;
411         path->reada = 1;
412
413         key.objectid = last;
414         key.offset = 0;
415         key.type = BTRFS_EXTENT_ITEM_KEY;
        /* (Re)entry point after the mutex and commit_root_sem were dropped. */
416 again:
417         mutex_lock(&caching_ctl->mutex);
418         /* need to make sure the commit_root doesn't disappear */
419         down_read(&fs_info->commit_root_sem);
420
        /* Re-search with both locks still held. */
421 next:
422         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
423         if (ret < 0)
424                 goto err;
425
426         leaf = path->nodes[0];
427         nritems = btrfs_header_nritems(leaf);
428
429         while (1) {
                /*
                 * Filesystem is closing: stop scanning.  With last set to
                 * (u64)-1 the final add_new_free_space() call below sees
                 * start >= end and adds nothing.
                 */
430                 if (btrfs_fs_closing(fs_info) > 1) {
431                         last = (u64)-1;
432                         break;
433                 }
434
435                 if (path->slots[0] < nritems) {
436                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
437                 } else {
                        /* End of this leaf: fetch the next key into 'key'. */
438                         ret = find_next_key(path, 0, &key);
439                         if (ret)
440                                 break;
441
                        /*
                         * Back off when a reschedule is due or somebody is
                         * waiting on commit_root_sem: publish the progress,
                         * release everything and restart the search from
                         * 'key' at the 'again' label.
                         */
442                         if (need_resched() ||
443                             rwsem_is_contended(&fs_info->commit_root_sem)) {
444                                 caching_ctl->progress = last;
445                                 btrfs_release_path(path);
446                                 up_read(&fs_info->commit_root_sem);
447                                 mutex_unlock(&caching_ctl->mutex);
448                                 cond_resched();
449                                 goto again;
450                         }
451
452                         ret = btrfs_next_leaf(extent_root, path);
453                         if (ret < 0)
454                                 goto err;
455                         if (ret)
456                                 break;
457                         leaf = path->nodes[0];
458                         nritems = btrfs_header_nritems(leaf);
459                         continue;
460                 }
461
                /*
                 * The key lies before the position already accounted for:
                 * search again starting at 'last'.
                 */
462                 if (key.objectid < last) {
463                         key.objectid = last;
464                         key.offset = 0;
465                         key.type = BTRFS_EXTENT_ITEM_KEY;
466
467                         caching_ctl->progress = last;
468                         btrfs_release_path(path);
469                         goto next;
470                 }
471
                /* Items in front of this block group are skipped. */
472                 if (key.objectid < block_group->key.objectid) {
473                         path->slots[0]++;
474                         continue;
475                 }
476
                /* Past the end of this block group: scanning is complete. */
477                 if (key.objectid >= block_group->key.objectid +
478                     block_group->key.offset)
479                         break;
480
481                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
482                     key.type == BTRFS_METADATA_ITEM_KEY) {
                        /* The gap between 'last' and this extent is free. */
483                         total_found += add_new_free_space(block_group,
484                                                           fs_info, last,
485                                                           key.objectid);
                        /*
                         * Advance 'last' past this extent: by nodesize for
                         * METADATA_ITEM keys, by key.offset otherwise.
                         */
486                         if (key.type == BTRFS_METADATA_ITEM_KEY)
487                                 last = key.objectid +
488                                         fs_info->tree_root->nodesize;
489                         else
490                                 last = key.objectid + key.offset;
491
                        /* Wake waiters each time more than 2MiB has been found. */
492                         if (total_found > (1024 * 1024 * 2)) {
493                                 total_found = 0;
494                                 wake_up(&caching_ctl->wait);
495                         }
496                 }
497                 path->slots[0]++;
498         }
        /* Every 'break' out of the loop above is treated as success. */
499         ret = 0;
500
        /* Account the tail between the last extent and the group end. */
501         total_found += add_new_free_space(block_group, fs_info, last,
502                                           block_group->key.objectid +
503                                           block_group->key.offset);
504         caching_ctl->progress = (u64)-1;
505
506         spin_lock(&block_group->lock);
507         block_group->caching_ctl = NULL;
508         block_group->cached = BTRFS_CACHE_FINISHED;
509         spin_unlock(&block_group->lock);
510
        /* Also reached by falling through on success (ret == 0). */
511 err:
512         btrfs_free_path(path);
513         up_read(&fs_info->commit_root_sem);
514
515         free_excluded_extents(extent_root, block_group);
516
517         mutex_unlock(&caching_ctl->mutex);
518 out:
        /* Non-zero ret: allocation or search failure, flag the group. */
519         if (ret) {
520                 spin_lock(&block_group->lock);
521                 block_group->caching_ctl = NULL;
522                 block_group->cached = BTRFS_CACHE_ERROR;
523                 spin_unlock(&block_group->lock);
524         }
525         wake_up(&caching_ctl->wait);
526
527         put_caching_control(caching_ctl);
528         btrfs_put_block_group(block_group);
529 }
530
/*
 * Start caching the free space of block group @cache.
 *
 * @cache:           block group to cache.
 * @load_cache_only: when non-zero, only try load_free_space_cache() (if the
 *                   SPACE_CACHE mount option is set) and never queue the
 *                   caching work.
 *
 * A new caching control is allocated up front.  If the block group is not in
 * state BTRFS_CACHE_NO once any BTRFS_CACHE_FAST phase of another thread has
 * ended, the new control is freed and 0 is returned.  Otherwise the group
 * moves to BTRFS_CACHE_FAST and, depending on the outcome, to
 * BTRFS_CACHE_FINISHED, back to BTRFS_CACHE_NO, or to BTRFS_CACHE_STARTED
 * with caching_thread() queued on fs_info->caching_workers.
 *
 * Returns -ENOMEM if the caching control cannot be allocated, 0 on the early
 * exits, and otherwise 'ret', which holds the load_free_space_cache() result
 * when that was called and 0 when it was not.
 */
531 static int cache_block_group(struct btrfs_block_group_cache *cache,
532                              int load_cache_only)
533 {
534         DEFINE_WAIT(wait);
535         struct btrfs_fs_info *fs_info = cache->fs_info;
536         struct btrfs_caching_control *caching_ctl;
537         int ret = 0;
538
539         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
540         if (!caching_ctl)
541                 return -ENOMEM;
542
543         INIT_LIST_HEAD(&caching_ctl->list);
544         mutex_init(&caching_ctl->mutex);
545         init_waitqueue_head(&caching_ctl->wait);
546         caching_ctl->block_group = cache;
547         caching_ctl->progress = cache->key.objectid;
548         atomic_set(&caching_ctl->count, 1);
549         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
550                         caching_thread, NULL, NULL);
551
552         spin_lock(&cache->lock);
553         /*
554          * This should be a rare occasion, but this could happen I think in the
555          * case where one thread starts to load the space cache info, and then
556          * some other thread starts a transaction commit which tries to do an
557          * allocation while the other thread is still loading the space cache
558          * info.  The previous loop should have kept us from choosing this block
559          * group, but if we've moved to the state where we will wait on caching
560          * block groups we need to first check if we're doing a fast load here,
561          * so we can wait for it to finish, otherwise we could end up allocating
562          * from a block group whose cache gets evicted for one reason or
563          * another.
564          */
565         while (cache->cached == BTRFS_CACHE_FAST) {
566                 struct btrfs_caching_control *ctl;
567
                /* Hold a reference on the other thread's control while sleeping. */
568                 ctl = cache->caching_ctl;
569                 atomic_inc(&ctl->count);
570                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
571                 spin_unlock(&cache->lock);
572
573                 schedule();
574
575                 finish_wait(&ctl->wait, &wait);
576                 put_caching_control(ctl);
577                 spin_lock(&cache->lock);
578         }
579
        /* Caching already started, finished or failed: our control is unused. */
580         if (cache->cached != BTRFS_CACHE_NO) {
581                 spin_unlock(&cache->lock);
582                 kfree(caching_ctl);
583                 return 0;
584         }
585         WARN_ON(cache->caching_ctl);
586         cache->caching_ctl = caching_ctl;
587         cache->cached = BTRFS_CACHE_FAST;
588         spin_unlock(&cache->lock);
589
590         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
591                 mutex_lock(&caching_ctl->mutex);
592                 ret = load_free_space_cache(fs_info, cache);
593
594                 spin_lock(&cache->lock);
                /* ret == 1 is handled as "cache loaded": caching is finished. */
595                 if (ret == 1) {
596                         cache->caching_ctl = NULL;
597                         cache->cached = BTRFS_CACHE_FINISHED;
598                         cache->last_byte_to_unpin = (u64)-1;
599                         caching_ctl->progress = (u64)-1;
600                 } else {
601                         if (load_cache_only) {
602                                 cache->caching_ctl = NULL;
603                                 cache->cached = BTRFS_CACHE_NO;
604                         } else {
605                                 cache->cached = BTRFS_CACHE_STARTED;
606                                 cache->has_caching_ctl = 1;
607                         }
608                 }
609                 spin_unlock(&cache->lock);
610                 mutex_unlock(&caching_ctl->mutex);
611
                /* Wake threads sleeping in the BTRFS_CACHE_FAST wait loop. */
612                 wake_up(&caching_ctl->wait);
613                 if (ret == 1) {
614                         put_caching_control(caching_ctl);
615                         free_excluded_extents(fs_info->extent_root, cache);
616                         return 0;
617                 }
618         } else {
619                 /*
620                  * We are not going to do the fast caching, set cached to the
621                  * appropriate value and wakeup any waiters.
622                  */
623                 spin_lock(&cache->lock);
624                 if (load_cache_only) {
625                         cache->caching_ctl = NULL;
626                         cache->cached = BTRFS_CACHE_NO;
627                 } else {
628                         cache->cached = BTRFS_CACHE_STARTED;
629                         cache->has_caching_ctl = 1;
630                 }
631                 spin_unlock(&cache->lock);
632                 wake_up(&caching_ctl->wait);
633         }
634
        /* Caller only wanted the cache load: drop the control, queue nothing. */
635         if (load_cache_only) {
636                 put_caching_control(caching_ctl);
637                 return 0;
638         }
639
        /* Extra reference for the fs_info->caching_block_groups list entry. */
640         down_write(&fs_info->commit_root_sem);
641         atomic_inc(&caching_ctl->count);
642         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
643         up_write(&fs_info->commit_root_sem);
644
        /* Block group reference that caching_thread() puts when it is done. */
645         btrfs_get_block_group(cache);
646
647         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
648
649         return ret;
650 }
651
652 /*
653  * return the block group that starts at or after bytenr
654  */
655 static struct btrfs_block_group_cache *
656 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
657 {
658         struct btrfs_block_group_cache *cache;
659
660         cache = block_group_cache_tree_search(info, bytenr, 0);
661
662         return cache;
663 }
664
665 /*
666  * return the block group that contains the given bytenr
667  */
668 struct btrfs_block_group_cache *btrfs_lookup_block_group(
669                                                  struct btrfs_fs_info *info,
670                                                  u64 bytenr)
671 {
672         struct btrfs_block_group_cache *cache;
673
674         cache = block_group_cache_tree_search(info, bytenr, 1);
675
676         return cache;
677 }
678
679 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
680                                                   u64 flags)
681 {
682         struct list_head *head = &info->space_info;
683         struct btrfs_space_info *found;
684
685         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
686
687         rcu_read_lock();
688         list_for_each_entry_rcu(found, head, list) {
689                 if (found->flags & flags) {
690                         rcu_read_unlock();
691                         return found;
692                 }
693         }
694         rcu_read_unlock();
695         return NULL;
696 }
697
698 /*
699  * after adding space to the filesystem, we need to clear the full flags
700  * on all the space infos.
701  */
702 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
703 {
704         struct list_head *head = &info->space_info;
705         struct btrfs_space_info *found;
706
707         rcu_read_lock();
708         list_for_each_entry_rcu(found, head, list)
709                 found->full = 0;
710         rcu_read_unlock();
711 }
712
713 /* simple helper to search for an existing data extent at a given offset */
714 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
715 {
716         int ret;
717         struct btrfs_key key;
718         struct btrfs_path *path;
719
720         path = btrfs_alloc_path();
721         if (!path)
722                 return -ENOMEM;
723
724         key.objectid = start;
725         key.offset = len;
726         key.type = BTRFS_EXTENT_ITEM_KEY;
727         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
728                                 0, 0);
729         btrfs_free_path(path);
730         return ret;
731 }
732
733 /*
734  * helper function to lookup reference count and flags of a tree block.
735  *
736  * the head node for delayed ref is used to store the sum of all the
737  * reference count modifications queued up in the rbtree. the head
738  * node may also store the extent flags to set. This way you can check
739  * to see what the reference count and extent flags would be if all of
740  * the delayed refs are not processed.
741  */
742 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
743                              struct btrfs_root *root, u64 bytenr,
744                              u64 offset, int metadata, u64 *refs, u64 *flags)
745 {
746         struct btrfs_delayed_ref_head *head;
747         struct btrfs_delayed_ref_root *delayed_refs;
748         struct btrfs_path *path;
749         struct btrfs_extent_item *ei;
750         struct extent_buffer *leaf;
751         struct btrfs_key key;
752         u32 item_size;
753         u64 num_refs;
754         u64 extent_flags;
755         int ret;
756
757         /*
758          * If we don't have skinny metadata, don't bother doing anything
759          * different
760          */
761         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
762                 offset = root->nodesize;
763                 metadata = 0;
764         }
765
766         path = btrfs_alloc_path();
767         if (!path)
768                 return -ENOMEM;
769
770         if (!trans) {
771                 path->skip_locking = 1;
772                 path->search_commit_root = 1;
773         }
774
775 search_again:
776         key.objectid = bytenr;
777         key.offset = offset;
778         if (metadata)
779                 key.type = BTRFS_METADATA_ITEM_KEY;
780         else
781                 key.type = BTRFS_EXTENT_ITEM_KEY;
782
783         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
784                                 &key, path, 0, 0);
785         if (ret < 0)
786                 goto out_free;
787
788         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
789                 if (path->slots[0]) {
790                         path->slots[0]--;
791                         btrfs_item_key_to_cpu(path->nodes[0], &key,
792                                               path->slots[0]);
793                         if (key.objectid == bytenr &&
794                             key.type == BTRFS_EXTENT_ITEM_KEY &&
795                             key.offset == root->nodesize)
796                                 ret = 0;
797                 }
798         }
799
800         if (ret == 0) {
801                 leaf = path->nodes[0];
802                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
803                 if (item_size >= sizeof(*ei)) {
804                         ei = btrfs_item_ptr(leaf, path->slots[0],
805                                             struct btrfs_extent_item);
806                         num_refs = btrfs_extent_refs(leaf, ei);
807                         extent_flags = btrfs_extent_flags(leaf, ei);
808                 } else {
809 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
810                         struct btrfs_extent_item_v0 *ei0;
811                         BUG_ON(item_size != sizeof(*ei0));
812                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
813                                              struct btrfs_extent_item_v0);
814                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
815                         /* FIXME: this isn't correct for data */
816                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
817 #else
818                         BUG();
819 #endif
820                 }
821                 BUG_ON(num_refs == 0);
822         } else {
823                 num_refs = 0;
824                 extent_flags = 0;
825                 ret = 0;
826         }
827
828         if (!trans)
829                 goto out;
830
831         delayed_refs = &trans->transaction->delayed_refs;
832         spin_lock(&delayed_refs->lock);
833         head = btrfs_find_delayed_ref_head(trans, bytenr);
834         if (head) {
835                 if (!mutex_trylock(&head->mutex)) {
836                         atomic_inc(&head->node.refs);
837                         spin_unlock(&delayed_refs->lock);
838
839                         btrfs_release_path(path);
840
841                         /*
842                          * Mutex was contended, block until it's released and try
843                          * again
844                          */
845                         mutex_lock(&head->mutex);
846                         mutex_unlock(&head->mutex);
847                         btrfs_put_delayed_ref(&head->node);
848                         goto search_again;
849                 }
850                 spin_lock(&head->lock);
851                 if (head->extent_op && head->extent_op->update_flags)
852                         extent_flags |= head->extent_op->flags_to_set;
853                 else
854                         BUG_ON(num_refs == 0);
855
856                 num_refs += head->node.ref_mod;
857                 spin_unlock(&head->lock);
858                 mutex_unlock(&head->mutex);
859         }
860         spin_unlock(&delayed_refs->lock);
861 out:
862         WARN_ON(num_refs == 0);
863         if (refs)
864                 *refs = num_refs;
865         if (flags)
866                 *flags = extent_flags;
867 out_free:
868         btrfs_free_path(path);
869         return ret;
870 }
871
872 /*
873  * Back reference rules.  Back refs have three main goals:
874  *
875  * 1) differentiate between all holders of references to an extent so that
876  *    when a reference is dropped we can make sure it was a valid reference
877  *    before freeing the extent.
878  *
879  * 2) Provide enough information to quickly find the holders of an extent
880  *    if we notice a given block is corrupted or bad.
881  *
882  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
883  *    maintenance.  This is actually the same as #2, but with a slightly
884  *    different use case.
885  *
886  * There are two kinds of back refs. The implicit back refs is optimized
887  * for pointers in non-shared tree blocks. For a given pointer in a block,
888  * back refs of this kind provide information about the block's owner tree
889  * and the pointer's key. These information allow us to find the block by
890  * b-tree searching. The full back refs is for pointers in tree blocks not
891  * referenced by their owner trees. The location of tree block is recorded
892  * in the back refs. Actually the full back refs is generic, and can be
893  * used in all cases the implicit back refs is used. The major shortcoming
894  * of the full back refs is its overhead. Every time a tree block gets
895  * COWed, we have to update back refs entry for all pointers in it.
896  *
897  * For a newly allocated tree block, we use implicit back refs for
898  * pointers in it. This means most tree related operations only involve
899  * implicit back refs. For a tree block created in old transaction, the
900  * only way to drop a reference to it is COW it. So we can detect the
901  * event that tree block loses its owner tree's reference and do the
902  * back refs conversion.
903  *
904  * When a tree block is COW'd through a tree, there are four cases:
905  *
906  * The reference count of the block is one and the tree is the block's
907  * owner tree. Nothing to do in this case.
908  *
909  * The reference count of the block is one and the tree is not the
910  * block's owner tree. In this case, full back refs is used for pointers
911  * in the block. Remove these full back refs, add implicit back refs for
912  * every pointers in the new block.
913  *
914  * The reference count of the block is greater than one and the tree is
915  * the block's owner tree. In this case, implicit back refs is used for
916  * pointers in the block. Add full back refs for every pointers in the
917  * block, increase lower level extents' reference counts. The original
918  * implicit back refs are entailed to the new block.
919  *
920  * The reference count of the block is greater than one and the tree is
921  * not the block's owner tree. Add implicit back refs for every pointer in
922  * the new block, increase lower level extents' reference count.
923  *
924  * Back Reference Key composing:
925  *
926  * The key objectid corresponds to the first byte in the extent,
927  * The key type is used to differentiate between types of back refs.
928  * There are different meanings of the key offset for different types
929  * of back refs.
930  *
931  * File extents can be referenced by:
932  *
933  * - multiple snapshots, subvolumes, or different generations in one subvol
934  * - different files inside a single subvolume
935  * - different offsets inside a file (bookend extents in file.c)
936  *
937  * The extent ref structure for the implicit back refs has fields for:
938  *
939  * - Objectid of the subvolume root
940  * - objectid of the file holding the reference
941  * - original offset in the file
942  * - how many bookend extents
943  *
944  * The key offset for the implicit back refs is hash of the first
945  * three fields.
946  *
947  * The extent ref structure for the full back refs has field for:
948  *
949  * - number of pointers in the tree leaf
950  *
951  * The key offset for the full back refs is the first byte of
952  * the tree leaf
953  *
954  * When a file extent is allocated, The implicit back refs is used.
955  * the fields are filled in:
956  *
957  *     (root_key.objectid, inode objectid, offset in file, 1)
958  *
959  * When a file extent is removed by file truncation, we find the
960  * corresponding implicit back refs and check the following fields:
961  *
962  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
963  *
964  * Btree extents can be referenced by:
965  *
966  * - Different subvolumes
967  *
968  * Both the implicit back refs and the full back refs for tree blocks
969  * only consist of key. The key offset for the implicit back refs is
970  * objectid of block's owner tree. The key offset for the full back refs
971  * is the first byte of parent block.
972  *
973  * When implicit back refs is used, information about the lowest key and
974  * level of the tree block are required. These information are stored in
975  * tree block info structure.
976  */
977
978 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
979 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
980                                   struct btrfs_root *root,
981                                   struct btrfs_path *path,
982                                   u64 owner, u32 extra_size)
983 {
984         struct btrfs_extent_item *item;
985         struct btrfs_extent_item_v0 *ei0;
986         struct btrfs_extent_ref_v0 *ref0;
987         struct btrfs_tree_block_info *bi;
988         struct extent_buffer *leaf;
989         struct btrfs_key key;
990         struct btrfs_key found_key;
991         u32 new_size = sizeof(*item);
992         u64 refs;
993         int ret;
994
995         leaf = path->nodes[0];
996         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
997
998         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
999         ei0 = btrfs_item_ptr(leaf, path->slots[0],
1000                              struct btrfs_extent_item_v0);
1001         refs = btrfs_extent_refs_v0(leaf, ei0);
1002
1003         if (owner == (u64)-1) {
1004                 while (1) {
1005                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1006                                 ret = btrfs_next_leaf(root, path);
1007                                 if (ret < 0)
1008                                         return ret;
1009                                 BUG_ON(ret > 0); /* Corruption */
1010                                 leaf = path->nodes[0];
1011                         }
1012                         btrfs_item_key_to_cpu(leaf, &found_key,
1013                                               path->slots[0]);
1014                         BUG_ON(key.objectid != found_key.objectid);
1015                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1016                                 path->slots[0]++;
1017                                 continue;
1018                         }
1019                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1020                                               struct btrfs_extent_ref_v0);
1021                         owner = btrfs_ref_objectid_v0(leaf, ref0);
1022                         break;
1023                 }
1024         }
1025         btrfs_release_path(path);
1026
1027         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1028                 new_size += sizeof(*bi);
1029
1030         new_size -= sizeof(*ei0);
1031         ret = btrfs_search_slot(trans, root, &key, path,
1032                                 new_size + extra_size, 1);
1033         if (ret < 0)
1034                 return ret;
1035         BUG_ON(ret); /* Corruption */
1036
1037         btrfs_extend_item(root, path, new_size);
1038
1039         leaf = path->nodes[0];
1040         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1041         btrfs_set_extent_refs(leaf, item, refs);
1042         /* FIXME: get real generation */
1043         btrfs_set_extent_generation(leaf, item, 0);
1044         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1045                 btrfs_set_extent_flags(leaf, item,
1046                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1047                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1048                 bi = (struct btrfs_tree_block_info *)(item + 1);
1049                 /* FIXME: get first key of the block */
1050                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1051                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1052         } else {
1053                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1054         }
1055         btrfs_mark_buffer_dirty(leaf);
1056         return 0;
1057 }
1058 #endif
1059
1060 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1061 {
1062         u32 high_crc = ~(u32)0;
1063         u32 low_crc = ~(u32)0;
1064         __le64 lenum;
1065
1066         lenum = cpu_to_le64(root_objectid);
1067         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1068         lenum = cpu_to_le64(owner);
1069         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1070         lenum = cpu_to_le64(offset);
1071         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1072
1073         return ((u64)high_crc << 31) ^ (u64)low_crc;
1074 }
1075
1076 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1077                                      struct btrfs_extent_data_ref *ref)
1078 {
1079         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1080                                     btrfs_extent_data_ref_objectid(leaf, ref),
1081                                     btrfs_extent_data_ref_offset(leaf, ref));
1082 }
1083
1084 static int match_extent_data_ref(struct extent_buffer *leaf,
1085                                  struct btrfs_extent_data_ref *ref,
1086                                  u64 root_objectid, u64 owner, u64 offset)
1087 {
1088         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1089             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1090             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1091                 return 0;
1092         return 1;
1093 }
1094
1095 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1096                                            struct btrfs_root *root,
1097                                            struct btrfs_path *path,
1098                                            u64 bytenr, u64 parent,
1099                                            u64 root_objectid,
1100                                            u64 owner, u64 offset)
1101 {
1102         struct btrfs_key key;
1103         struct btrfs_extent_data_ref *ref;
1104         struct extent_buffer *leaf;
1105         u32 nritems;
1106         int ret;
1107         int recow;
1108         int err = -ENOENT;
1109
1110         key.objectid = bytenr;
1111         if (parent) {
1112                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1113                 key.offset = parent;
1114         } else {
1115                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1116                 key.offset = hash_extent_data_ref(root_objectid,
1117                                                   owner, offset);
1118         }
1119 again:
1120         recow = 0;
1121         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1122         if (ret < 0) {
1123                 err = ret;
1124                 goto fail;
1125         }
1126
1127         if (parent) {
1128                 if (!ret)
1129                         return 0;
1130 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1131                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1132                 btrfs_release_path(path);
1133                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1134                 if (ret < 0) {
1135                         err = ret;
1136                         goto fail;
1137                 }
1138                 if (!ret)
1139                         return 0;
1140 #endif
1141                 goto fail;
1142         }
1143
1144         leaf = path->nodes[0];
1145         nritems = btrfs_header_nritems(leaf);
1146         while (1) {
1147                 if (path->slots[0] >= nritems) {
1148                         ret = btrfs_next_leaf(root, path);
1149                         if (ret < 0)
1150                                 err = ret;
1151                         if (ret)
1152                                 goto fail;
1153
1154                         leaf = path->nodes[0];
1155                         nritems = btrfs_header_nritems(leaf);
1156                         recow = 1;
1157                 }
1158
1159                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1160                 if (key.objectid != bytenr ||
1161                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1162                         goto fail;
1163
1164                 ref = btrfs_item_ptr(leaf, path->slots[0],
1165                                      struct btrfs_extent_data_ref);
1166
1167                 if (match_extent_data_ref(leaf, ref, root_objectid,
1168                                           owner, offset)) {
1169                         if (recow) {
1170                                 btrfs_release_path(path);
1171                                 goto again;
1172                         }
1173                         err = 0;
1174                         break;
1175                 }
1176                 path->slots[0]++;
1177         }
1178 fail:
1179         return err;
1180 }
1181
1182 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1183                                            struct btrfs_root *root,
1184                                            struct btrfs_path *path,
1185                                            u64 bytenr, u64 parent,
1186                                            u64 root_objectid, u64 owner,
1187                                            u64 offset, int refs_to_add)
1188 {
1189         struct btrfs_key key;
1190         struct extent_buffer *leaf;
1191         u32 size;
1192         u32 num_refs;
1193         int ret;
1194
1195         key.objectid = bytenr;
1196         if (parent) {
1197                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1198                 key.offset = parent;
1199                 size = sizeof(struct btrfs_shared_data_ref);
1200         } else {
1201                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1202                 key.offset = hash_extent_data_ref(root_objectid,
1203                                                   owner, offset);
1204                 size = sizeof(struct btrfs_extent_data_ref);
1205         }
1206
1207         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1208         if (ret && ret != -EEXIST)
1209                 goto fail;
1210
1211         leaf = path->nodes[0];
1212         if (parent) {
1213                 struct btrfs_shared_data_ref *ref;
1214                 ref = btrfs_item_ptr(leaf, path->slots[0],
1215                                      struct btrfs_shared_data_ref);
1216                 if (ret == 0) {
1217                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1218                 } else {
1219                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1220                         num_refs += refs_to_add;
1221                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1222                 }
1223         } else {
1224                 struct btrfs_extent_data_ref *ref;
1225                 while (ret == -EEXIST) {
1226                         ref = btrfs_item_ptr(leaf, path->slots[0],
1227                                              struct btrfs_extent_data_ref);
1228                         if (match_extent_data_ref(leaf, ref, root_objectid,
1229                                                   owner, offset))
1230                                 break;
1231                         btrfs_release_path(path);
1232                         key.offset++;
1233                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1234                                                       size);
1235                         if (ret && ret != -EEXIST)
1236                                 goto fail;
1237
1238                         leaf = path->nodes[0];
1239                 }
1240                 ref = btrfs_item_ptr(leaf, path->slots[0],
1241                                      struct btrfs_extent_data_ref);
1242                 if (ret == 0) {
1243                         btrfs_set_extent_data_ref_root(leaf, ref,
1244                                                        root_objectid);
1245                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1246                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1247                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1248                 } else {
1249                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1250                         num_refs += refs_to_add;
1251                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1252                 }
1253         }
1254         btrfs_mark_buffer_dirty(leaf);
1255         ret = 0;
1256 fail:
1257         btrfs_release_path(path);
1258         return ret;
1259 }
1260
1261 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1262                                            struct btrfs_root *root,
1263                                            struct btrfs_path *path,
1264                                            int refs_to_drop, int *last_ref)
1265 {
1266         struct btrfs_key key;
1267         struct btrfs_extent_data_ref *ref1 = NULL;
1268         struct btrfs_shared_data_ref *ref2 = NULL;
1269         struct extent_buffer *leaf;
1270         u32 num_refs = 0;
1271         int ret = 0;
1272
1273         leaf = path->nodes[0];
1274         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1275
1276         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1277                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1278                                       struct btrfs_extent_data_ref);
1279                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1280         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1281                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1282                                       struct btrfs_shared_data_ref);
1283                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1284 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1285         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1286                 struct btrfs_extent_ref_v0 *ref0;
1287                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1288                                       struct btrfs_extent_ref_v0);
1289                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1290 #endif
1291         } else {
1292                 BUG();
1293         }
1294
1295         BUG_ON(num_refs < refs_to_drop);
1296         num_refs -= refs_to_drop;
1297
1298         if (num_refs == 0) {
1299                 ret = btrfs_del_item(trans, root, path);
1300                 *last_ref = 1;
1301         } else {
1302                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1303                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1304                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1305                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1306 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1307                 else {
1308                         struct btrfs_extent_ref_v0 *ref0;
1309                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1310                                         struct btrfs_extent_ref_v0);
1311                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1312                 }
1313 #endif
1314                 btrfs_mark_buffer_dirty(leaf);
1315         }
1316         return ret;
1317 }
1318
1319 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1320                                           struct btrfs_path *path,
1321                                           struct btrfs_extent_inline_ref *iref)
1322 {
1323         struct btrfs_key key;
1324         struct extent_buffer *leaf;
1325         struct btrfs_extent_data_ref *ref1;
1326         struct btrfs_shared_data_ref *ref2;
1327         u32 num_refs = 0;
1328
1329         leaf = path->nodes[0];
1330         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1331         if (iref) {
1332                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1333                     BTRFS_EXTENT_DATA_REF_KEY) {
1334                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1335                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1336                 } else {
1337                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1338                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1339                 }
1340         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1341                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1342                                       struct btrfs_extent_data_ref);
1343                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1344         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1345                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1346                                       struct btrfs_shared_data_ref);
1347                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1348 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1349         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1350                 struct btrfs_extent_ref_v0 *ref0;
1351                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1352                                       struct btrfs_extent_ref_v0);
1353                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1354 #endif
1355         } else {
1356                 WARN_ON(1);
1357         }
1358         return num_refs;
1359 }
1360
1361 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1362                                           struct btrfs_root *root,
1363                                           struct btrfs_path *path,
1364                                           u64 bytenr, u64 parent,
1365                                           u64 root_objectid)
1366 {
1367         struct btrfs_key key;
1368         int ret;
1369
1370         key.objectid = bytenr;
1371         if (parent) {
1372                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1373                 key.offset = parent;
1374         } else {
1375                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1376                 key.offset = root_objectid;
1377         }
1378
1379         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1380         if (ret > 0)
1381                 ret = -ENOENT;
1382 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1383         if (ret == -ENOENT && parent) {
1384                 btrfs_release_path(path);
1385                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1386                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1387                 if (ret > 0)
1388                         ret = -ENOENT;
1389         }
1390 #endif
1391         return ret;
1392 }
1393
1394 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1395                                           struct btrfs_root *root,
1396                                           struct btrfs_path *path,
1397                                           u64 bytenr, u64 parent,
1398                                           u64 root_objectid)
1399 {
1400         struct btrfs_key key;
1401         int ret;
1402
1403         key.objectid = bytenr;
1404         if (parent) {
1405                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1406                 key.offset = parent;
1407         } else {
1408                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1409                 key.offset = root_objectid;
1410         }
1411
1412         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1413         btrfs_release_path(path);
1414         return ret;
1415 }
1416
1417 static inline int extent_ref_type(u64 parent, u64 owner)
1418 {
1419         int type;
1420         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1421                 if (parent > 0)
1422                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1423                 else
1424                         type = BTRFS_TREE_BLOCK_REF_KEY;
1425         } else {
1426                 if (parent > 0)
1427                         type = BTRFS_SHARED_DATA_REF_KEY;
1428                 else
1429                         type = BTRFS_EXTENT_DATA_REF_KEY;
1430         }
1431         return type;
1432 }
1433
1434 static int find_next_key(struct btrfs_path *path, int level,
1435                          struct btrfs_key *key)
1436
1437 {
1438         for (; level < BTRFS_MAX_LEVEL; level++) {
1439                 if (!path->nodes[level])
1440                         break;
1441                 if (path->slots[level] + 1 >=
1442                     btrfs_header_nritems(path->nodes[level]))
1443                         continue;
1444                 if (level == 0)
1445                         btrfs_item_key_to_cpu(path->nodes[level], key,
1446                                               path->slots[level] + 1);
1447                 else
1448                         btrfs_node_key_to_cpu(path->nodes[level], key,
1449                                               path->slots[level] + 1);
1450                 return 0;
1451         }
1452         return 1;
1453 }
1454
1455 /*
1456  * look for inline back ref. if back ref is found, *ref_ret is set
1457  * to the address of inline back ref, and 0 is returned.
1458  *
1459  * if back ref isn't found, *ref_ret is set to the address where it
1460  * should be inserted, and -ENOENT is returned.
1461  *
1462  * if insert is true and there are too many inline back refs, the path
1463  * points to the extent item, and -EAGAIN is returned.
1464  *
1465  * NOTE: inline back refs are ordered in the same way that back ref
1466  *       items in the tree are ordered.
1467  */
1468 static noinline_for_stack
1469 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1470                                  struct btrfs_root *root,
1471                                  struct btrfs_path *path,
1472                                  struct btrfs_extent_inline_ref **ref_ret,
1473                                  u64 bytenr, u64 num_bytes,
1474                                  u64 parent, u64 root_objectid,
1475                                  u64 owner, u64 offset, int insert)
1476 {
1477         struct btrfs_key key;
1478         struct extent_buffer *leaf;
1479         struct btrfs_extent_item *ei;
1480         struct btrfs_extent_inline_ref *iref;
1481         u64 flags;
1482         u64 item_size;
1483         unsigned long ptr;
1484         unsigned long end;
1485         int extra_size;
1486         int type;
1487         int want;
1488         int ret;
1489         int err = 0;
1490         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1491                                                  SKINNY_METADATA);
1492
1493         key.objectid = bytenr;
1494         key.type = BTRFS_EXTENT_ITEM_KEY;
1495         key.offset = num_bytes;
1496
1497         want = extent_ref_type(parent, owner);
1498         if (insert) {
1499                 extra_size = btrfs_extent_inline_ref_size(want);
1500                 path->keep_locks = 1;
1501         } else
1502                 extra_size = -1;
1503
1504         /*
1505          * Owner is our parent level, so we can just add one to get the level
1506          * for the block we are interested in.
1507          */
1508         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1509                 key.type = BTRFS_METADATA_ITEM_KEY;
1510                 key.offset = owner;
1511         }
1512
1513 again:
1514         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1515         if (ret < 0) {
1516                 err = ret;
1517                 goto out;
1518         }
1519
1520         /*
1521          * We may be a newly converted file system which still has the old fat
1522          * extent entries for metadata, so try and see if we have one of those.
1523          */
1524         if (ret > 0 && skinny_metadata) {
1525                 skinny_metadata = false;
1526                 if (path->slots[0]) {
1527                         path->slots[0]--;
1528                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1529                                               path->slots[0]);
1530                         if (key.objectid == bytenr &&
1531                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1532                             key.offset == num_bytes)
1533                                 ret = 0;
1534                 }
1535                 if (ret) {
1536                         key.objectid = bytenr;
1537                         key.type = BTRFS_EXTENT_ITEM_KEY;
1538                         key.offset = num_bytes;
1539                         btrfs_release_path(path);
1540                         goto again;
1541                 }
1542         }
1543
1544         if (ret && !insert) {
1545                 err = -ENOENT;
1546                 goto out;
1547         } else if (WARN_ON(ret)) {
1548                 err = -EIO;
1549                 goto out;
1550         }
1551
1552         leaf = path->nodes[0];
1553         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1554 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1555         if (item_size < sizeof(*ei)) {
1556                 if (!insert) {
1557                         err = -ENOENT;
1558                         goto out;
1559                 }
1560                 ret = convert_extent_item_v0(trans, root, path, owner,
1561                                              extra_size);
1562                 if (ret < 0) {
1563                         err = ret;
1564                         goto out;
1565                 }
1566                 leaf = path->nodes[0];
1567                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1568         }
1569 #endif
1570         BUG_ON(item_size < sizeof(*ei));
1571
1572         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1573         flags = btrfs_extent_flags(leaf, ei);
1574
1575         ptr = (unsigned long)(ei + 1);
1576         end = (unsigned long)ei + item_size;
1577
1578         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1579                 ptr += sizeof(struct btrfs_tree_block_info);
1580                 BUG_ON(ptr > end);
1581         }
1582
1583         err = -ENOENT;
1584         while (1) {
1585                 if (ptr >= end) {
1586                         WARN_ON(ptr > end);
1587                         break;
1588                 }
1589                 iref = (struct btrfs_extent_inline_ref *)ptr;
1590                 type = btrfs_extent_inline_ref_type(leaf, iref);
1591                 if (want < type)
1592                         break;
1593                 if (want > type) {
1594                         ptr += btrfs_extent_inline_ref_size(type);
1595                         continue;
1596                 }
1597
1598                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1599                         struct btrfs_extent_data_ref *dref;
1600                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1601                         if (match_extent_data_ref(leaf, dref, root_objectid,
1602                                                   owner, offset)) {
1603                                 err = 0;
1604                                 break;
1605                         }
1606                         if (hash_extent_data_ref_item(leaf, dref) <
1607                             hash_extent_data_ref(root_objectid, owner, offset))
1608                                 break;
1609                 } else {
1610                         u64 ref_offset;
1611                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1612                         if (parent > 0) {
1613                                 if (parent == ref_offset) {
1614                                         err = 0;
1615                                         break;
1616                                 }
1617                                 if (ref_offset < parent)
1618                                         break;
1619                         } else {
1620                                 if (root_objectid == ref_offset) {
1621                                         err = 0;
1622                                         break;
1623                                 }
1624                                 if (ref_offset < root_objectid)
1625                                         break;
1626                         }
1627                 }
1628                 ptr += btrfs_extent_inline_ref_size(type);
1629         }
1630         if (err == -ENOENT && insert) {
1631                 if (item_size + extra_size >=
1632                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1633                         err = -EAGAIN;
1634                         goto out;
1635                 }
1636                 /*
1637                  * To add new inline back ref, we have to make sure
1638                  * there is no corresponding back ref item.
1639                  * For simplicity, we just do not add new inline back
1640                  * ref if there is any kind of item for this block
1641                  */
1642                 if (find_next_key(path, 0, &key) == 0 &&
1643                     key.objectid == bytenr &&
1644                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1645                         err = -EAGAIN;
1646                         goto out;
1647                 }
1648         }
1649         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1650 out:
1651         if (insert) {
1652                 path->keep_locks = 0;
1653                 btrfs_unlock_up_safe(path, 1);
1654         }
1655         return err;
1656 }
1657
1658 /*
1659  * helper to add new inline back ref
1660  */
1661 static noinline_for_stack
1662 void setup_inline_extent_backref(struct btrfs_root *root,
1663                                  struct btrfs_path *path,
1664                                  struct btrfs_extent_inline_ref *iref,
1665                                  u64 parent, u64 root_objectid,
1666                                  u64 owner, u64 offset, int refs_to_add,
1667                                  struct btrfs_delayed_extent_op *extent_op)
1668 {
1669         struct extent_buffer *leaf;
1670         struct btrfs_extent_item *ei;
1671         unsigned long ptr;
1672         unsigned long end;
1673         unsigned long item_offset;
1674         u64 refs;
1675         int size;
1676         int type;
1677
1678         leaf = path->nodes[0];
1679         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1680         item_offset = (unsigned long)iref - (unsigned long)ei;
1681
1682         type = extent_ref_type(parent, owner);
1683         size = btrfs_extent_inline_ref_size(type);
1684
1685         btrfs_extend_item(root, path, size);
1686
1687         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1688         refs = btrfs_extent_refs(leaf, ei);
1689         refs += refs_to_add;
1690         btrfs_set_extent_refs(leaf, ei, refs);
1691         if (extent_op)
1692                 __run_delayed_extent_op(extent_op, leaf, ei);
1693
1694         ptr = (unsigned long)ei + item_offset;
1695         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1696         if (ptr < end - size)
1697                 memmove_extent_buffer(leaf, ptr + size, ptr,
1698                                       end - size - ptr);
1699
1700         iref = (struct btrfs_extent_inline_ref *)ptr;
1701         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1702         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1703                 struct btrfs_extent_data_ref *dref;
1704                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1705                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1706                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1707                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1708                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1709         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1710                 struct btrfs_shared_data_ref *sref;
1711                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1712                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1713                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1714         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1715                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1716         } else {
1717                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1718         }
1719         btrfs_mark_buffer_dirty(leaf);
1720 }
1721
1722 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1723                                  struct btrfs_root *root,
1724                                  struct btrfs_path *path,
1725                                  struct btrfs_extent_inline_ref **ref_ret,
1726                                  u64 bytenr, u64 num_bytes, u64 parent,
1727                                  u64 root_objectid, u64 owner, u64 offset)
1728 {
1729         int ret;
1730
1731         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1732                                            bytenr, num_bytes, parent,
1733                                            root_objectid, owner, offset, 0);
1734         if (ret != -ENOENT)
1735                 return ret;
1736
1737         btrfs_release_path(path);
1738         *ref_ret = NULL;
1739
1740         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1741                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1742                                             root_objectid);
1743         } else {
1744                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1745                                              root_objectid, owner, offset);
1746         }
1747         return ret;
1748 }
1749
1750 /*
1751  * helper to update/remove inline back ref
1752  */
1753 static noinline_for_stack
1754 void update_inline_extent_backref(struct btrfs_root *root,
1755                                   struct btrfs_path *path,
1756                                   struct btrfs_extent_inline_ref *iref,
1757                                   int refs_to_mod,
1758                                   struct btrfs_delayed_extent_op *extent_op,
1759                                   int *last_ref)
1760 {
1761         struct extent_buffer *leaf;
1762         struct btrfs_extent_item *ei;
1763         struct btrfs_extent_data_ref *dref = NULL;
1764         struct btrfs_shared_data_ref *sref = NULL;
1765         unsigned long ptr;
1766         unsigned long end;
1767         u32 item_size;
1768         int size;
1769         int type;
1770         u64 refs;
1771
1772         leaf = path->nodes[0];
1773         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1774         refs = btrfs_extent_refs(leaf, ei);
1775         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1776         refs += refs_to_mod;
1777         btrfs_set_extent_refs(leaf, ei, refs);
1778         if (extent_op)
1779                 __run_delayed_extent_op(extent_op, leaf, ei);
1780
1781         type = btrfs_extent_inline_ref_type(leaf, iref);
1782
1783         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1784                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1785                 refs = btrfs_extent_data_ref_count(leaf, dref);
1786         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1787                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1788                 refs = btrfs_shared_data_ref_count(leaf, sref);
1789         } else {
1790                 refs = 1;
1791                 BUG_ON(refs_to_mod != -1);
1792         }
1793
1794         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1795         refs += refs_to_mod;
1796
1797         if (refs > 0) {
1798                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1799                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1800                 else
1801                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1802         } else {
1803                 *last_ref = 1;
1804                 size =  btrfs_extent_inline_ref_size(type);
1805                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1806                 ptr = (unsigned long)iref;
1807                 end = (unsigned long)ei + item_size;
1808                 if (ptr + size < end)
1809                         memmove_extent_buffer(leaf, ptr, ptr + size,
1810                                               end - ptr - size);
1811                 item_size -= size;
1812                 btrfs_truncate_item(root, path, item_size, 1);
1813         }
1814         btrfs_mark_buffer_dirty(leaf);
1815 }
1816
1817 static noinline_for_stack
1818 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1819                                  struct btrfs_root *root,
1820                                  struct btrfs_path *path,
1821                                  u64 bytenr, u64 num_bytes, u64 parent,
1822                                  u64 root_objectid, u64 owner,
1823                                  u64 offset, int refs_to_add,
1824                                  struct btrfs_delayed_extent_op *extent_op)
1825 {
1826         struct btrfs_extent_inline_ref *iref;
1827         int ret;
1828
1829         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1830                                            bytenr, num_bytes, parent,
1831                                            root_objectid, owner, offset, 1);
1832         if (ret == 0) {
1833                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1834                 update_inline_extent_backref(root, path, iref,
1835                                              refs_to_add, extent_op, NULL);
1836         } else if (ret == -ENOENT) {
1837                 setup_inline_extent_backref(root, path, iref, parent,
1838                                             root_objectid, owner, offset,
1839                                             refs_to_add, extent_op);
1840                 ret = 0;
1841         }
1842         return ret;
1843 }
1844
1845 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1846                                  struct btrfs_root *root,
1847                                  struct btrfs_path *path,
1848                                  u64 bytenr, u64 parent, u64 root_objectid,
1849                                  u64 owner, u64 offset, int refs_to_add)
1850 {
1851         int ret;
1852         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1853                 BUG_ON(refs_to_add != 1);
1854                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1855                                             parent, root_objectid);
1856         } else {
1857                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1858                                              parent, root_objectid,
1859                                              owner, offset, refs_to_add);
1860         }
1861         return ret;
1862 }
1863
1864 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1865                                  struct btrfs_root *root,
1866                                  struct btrfs_path *path,
1867                                  struct btrfs_extent_inline_ref *iref,
1868                                  int refs_to_drop, int is_data, int *last_ref)
1869 {
1870         int ret = 0;
1871
1872         BUG_ON(!is_data && refs_to_drop != 1);
1873         if (iref) {
1874                 update_inline_extent_backref(root, path, iref,
1875                                              -refs_to_drop, NULL, last_ref);
1876         } else if (is_data) {
1877                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1878                                              last_ref);
1879         } else {
1880                 *last_ref = 1;
1881                 ret = btrfs_del_item(trans, root, path);
1882         }
1883         return ret;
1884 }
1885
1886 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1887 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1888                                u64 *discarded_bytes)
1889 {
1890         int j, ret = 0;
1891         u64 bytes_left, end;
1892         u64 aligned_start = ALIGN(start, 1 << 9);
1893
1894         if (WARN_ON(start != aligned_start)) {
1895                 len -= aligned_start - start;
1896                 len = round_down(len, 1 << 9);
1897                 start = aligned_start;
1898         }
1899
1900         *discarded_bytes = 0;
1901
1902         if (!len)
1903                 return 0;
1904
1905         end = start + len;
1906         bytes_left = len;
1907
1908         /* Skip any superblocks on this device. */
1909         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1910                 u64 sb_start = btrfs_sb_offset(j);
1911                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1912                 u64 size = sb_start - start;
1913
1914                 if (!in_range(sb_start, start, bytes_left) &&
1915                     !in_range(sb_end, start, bytes_left) &&
1916                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1917                         continue;
1918
1919                 /*
1920                  * Superblock spans beginning of range.  Adjust start and
1921                  * try again.
1922                  */
1923                 if (sb_start <= start) {
1924                         start += sb_end - start;
1925                         if (start > end) {
1926                                 bytes_left = 0;
1927                                 break;
1928                         }
1929                         bytes_left = end - start;
1930                         continue;
1931                 }
1932
1933                 if (size) {
1934                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1935                                                    GFP_NOFS, 0);
1936                         if (!ret)
1937                                 *discarded_bytes += size;
1938                         else if (ret != -EOPNOTSUPP)
1939                                 return ret;
1940                 }
1941
1942                 start = sb_end;
1943                 if (start > end) {
1944                         bytes_left = 0;
1945                         break;
1946                 }
1947                 bytes_left = end - start;
1948         }
1949
1950         if (bytes_left) {
1951                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1952                                            GFP_NOFS, 0);
1953                 if (!ret)
1954                         *discarded_bytes += bytes_left;
1955         }
1956         return ret;
1957 }
1958
1959 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1960                          u64 num_bytes, u64 *actual_bytes)
1961 {
1962         int ret;
1963         u64 discarded_bytes = 0;
1964         struct btrfs_bio *bbio = NULL;
1965
1966
1967         /* Tell the block device(s) that the sectors can be discarded */
1968         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1969                               bytenr, &num_bytes, &bbio, 0);
1970         /* Error condition is -ENOMEM */
1971         if (!ret) {
1972                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1973                 int i;
1974
1975
1976                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1977                         u64 bytes;
1978                         if (!stripe->dev->can_discard)
1979                                 continue;
1980
1981                         ret = btrfs_issue_discard(stripe->dev->bdev,
1982                                                   stripe->physical,
1983                                                   stripe->length,
1984                                                   &bytes);
1985                         if (!ret)
1986                                 discarded_bytes += bytes;
1987                         else if (ret != -EOPNOTSUPP)
1988                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1989
1990                         /*
1991                          * Just in case we get back EOPNOTSUPP for some reason,
1992                          * just ignore the return value so we don't screw up
1993                          * people calling discard_extent.
1994                          */
1995                         ret = 0;
1996                 }
1997                 btrfs_put_bbio(bbio);
1998         }
1999
2000         if (actual_bytes)
2001                 *actual_bytes = discarded_bytes;
2002
2003
2004         if (ret == -EOPNOTSUPP)
2005                 ret = 0;
2006         return ret;
2007 }
2008
2009 /* Can return -ENOMEM */
2010 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2011                          struct btrfs_root *root,
2012                          u64 bytenr, u64 num_bytes, u64 parent,
2013                          u64 root_objectid, u64 owner, u64 offset,
2014                          int no_quota)
2015 {
2016         int ret;
2017         struct btrfs_fs_info *fs_info = root->fs_info;
2018
2019         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2020                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2021
2022         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2023                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2024                                         num_bytes,
2025                                         parent, root_objectid, (int)owner,
2026                                         BTRFS_ADD_DELAYED_REF, NULL, no_quota);
2027         } else {
2028                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2029                                         num_bytes,
2030                                         parent, root_objectid, owner, offset,
2031                                         BTRFS_ADD_DELAYED_REF, NULL, no_quota);
2032         }
2033         return ret;
2034 }
2035
2036 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2037                                   struct btrfs_root *root,
2038                                   struct btrfs_delayed_ref_node *node,
2039                                   u64 parent, u64 root_objectid,
2040                                   u64 owner, u64 offset, int refs_to_add,
2041                                   struct btrfs_delayed_extent_op *extent_op)
2042 {
2043         struct btrfs_fs_info *fs_info = root->fs_info;
2044         struct btrfs_path *path;
2045         struct extent_buffer *leaf;
2046         struct btrfs_extent_item *item;
2047         struct btrfs_key key;
2048         u64 bytenr = node->bytenr;
2049         u64 num_bytes = node->num_bytes;
2050         u64 refs;
2051         int ret;
2052         int no_quota = node->no_quota;
2053
2054         path = btrfs_alloc_path();
2055         if (!path)
2056                 return -ENOMEM;
2057
2058         if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
2059                 no_quota = 1;
2060
2061         path->reada = 1;
2062         path->leave_spinning = 1;
2063         /* this will setup the path even if it fails to insert the back ref */
2064         ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2065                                            bytenr, num_bytes, parent,
2066                                            root_objectid, owner, offset,
2067                                            refs_to_add, extent_op);
2068         if ((ret < 0 && ret != -EAGAIN) || !ret)
2069                 goto out;
2070
2071         /*
2072          * Ok we had -EAGAIN which means we didn't have space to insert and
2073          * inline extent ref, so just update the reference count and add a
2074          * normal backref.
2075          */
2076         leaf = path->nodes[0];
2077         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2078         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2079         refs = btrfs_extent_refs(leaf, item);
2080         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2081         if (extent_op)
2082                 __run_delayed_extent_op(extent_op, leaf, item);
2083
2084         btrfs_mark_buffer_dirty(leaf);
2085         btrfs_release_path(path);
2086
2087         path->reada = 1;
2088         path->leave_spinning = 1;
2089         /* now insert the actual backref */
2090         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2091                                     path, bytenr, parent, root_objectid,
2092                                     owner, offset, refs_to_add);
2093         if (ret)
2094                 btrfs_abort_transaction(trans, root, ret);
2095 out:
2096         btrfs_free_path(path);
2097         return ret;
2098 }
2099
2100 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2101                                 struct btrfs_root *root,
2102                                 struct btrfs_delayed_ref_node *node,
2103                                 struct btrfs_delayed_extent_op *extent_op,
2104                                 int insert_reserved)
2105 {
2106         int ret = 0;
2107         struct btrfs_delayed_data_ref *ref;
2108         struct btrfs_key ins;
2109         u64 parent = 0;
2110         u64 ref_root = 0;
2111         u64 flags = 0;
2112
2113         ins.objectid = node->bytenr;
2114         ins.offset = node->num_bytes;
2115         ins.type = BTRFS_EXTENT_ITEM_KEY;
2116
2117         ref = btrfs_delayed_node_to_data_ref(node);
2118         trace_run_delayed_data_ref(node, ref, node->action);
2119
2120         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2121                 parent = ref->parent;
2122         ref_root = ref->root;
2123
2124         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2125                 if (extent_op)
2126                         flags |= extent_op->flags_to_set;
2127                 ret = alloc_reserved_file_extent(trans, root,
2128                                                  parent, ref_root, flags,
2129                                                  ref->objectid, ref->offset,
2130                                                  &ins, node->ref_mod);
2131         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2132                 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2133                                              ref_root, ref->objectid,
2134                                              ref->offset, node->ref_mod,
2135                                              extent_op);
2136         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2137                 ret = __btrfs_free_extent(trans, root, node, parent,
2138                                           ref_root, ref->objectid,
2139                                           ref->offset, node->ref_mod,
2140                                           extent_op);
2141         } else {
2142                 BUG();
2143         }
2144         return ret;
2145 }
2146
2147 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2148                                     struct extent_buffer *leaf,
2149                                     struct btrfs_extent_item *ei)
2150 {
2151         u64 flags = btrfs_extent_flags(leaf, ei);
2152         if (extent_op->update_flags) {
2153                 flags |= extent_op->flags_to_set;
2154                 btrfs_set_extent_flags(leaf, ei, flags);
2155         }
2156
2157         if (extent_op->update_key) {
2158                 struct btrfs_tree_block_info *bi;
2159                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2160                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2161                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2162         }
2163 }
2164
2165 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2166                                  struct btrfs_root *root,
2167                                  struct btrfs_delayed_ref_node *node,
2168                                  struct btrfs_delayed_extent_op *extent_op)
2169 {
2170         struct btrfs_key key;
2171         struct btrfs_path *path;
2172         struct btrfs_extent_item *ei;
2173         struct extent_buffer *leaf;
2174         u32 item_size;
2175         int ret;
2176         int err = 0;
2177         int metadata = !extent_op->is_data;
2178
2179         if (trans->aborted)
2180                 return 0;
2181
2182         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2183                 metadata = 0;
2184
2185         path = btrfs_alloc_path();
2186         if (!path)
2187                 return -ENOMEM;
2188
2189         key.objectid = node->bytenr;
2190
2191         if (metadata) {
2192                 key.type = BTRFS_METADATA_ITEM_KEY;
2193                 key.offset = extent_op->level;
2194         } else {
2195                 key.type = BTRFS_EXTENT_ITEM_KEY;
2196                 key.offset = node->num_bytes;
2197         }
2198
2199 again:
2200         path->reada = 1;
2201         path->leave_spinning = 1;
2202         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2203                                 path, 0, 1);
2204         if (ret < 0) {
2205                 err = ret;
2206                 goto out;
2207         }
2208         if (ret > 0) {
2209                 if (metadata) {
2210                         if (path->slots[0] > 0) {
2211                                 path->slots[0]--;
2212                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2213                                                       path->slots[0]);
2214                                 if (key.objectid == node->bytenr &&
2215                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2216                                     key.offset == node->num_bytes)
2217                                         ret = 0;
2218                         }
2219                         if (ret > 0) {
2220                                 btrfs_release_path(path);
2221                                 metadata = 0;
2222
2223                                 key.objectid = node->bytenr;
2224                                 key.offset = node->num_bytes;
2225                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2226                                 goto again;
2227                         }
2228                 } else {
2229                         err = -EIO;
2230                         goto out;
2231                 }
2232         }
2233
2234         leaf = path->nodes[0];
2235         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2236 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2237         if (item_size < sizeof(*ei)) {
2238                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2239                                              path, (u64)-1, 0);
2240                 if (ret < 0) {
2241                         err = ret;
2242                         goto out;
2243                 }
2244                 leaf = path->nodes[0];
2245                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2246         }
2247 #endif
2248         BUG_ON(item_size < sizeof(*ei));
2249         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2250         __run_delayed_extent_op(extent_op, leaf, ei);
2251
2252         btrfs_mark_buffer_dirty(leaf);
2253 out:
2254         btrfs_free_path(path);
2255         return err;
2256 }
2257
2258 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2259                                 struct btrfs_root *root,
2260                                 struct btrfs_delayed_ref_node *node,
2261                                 struct btrfs_delayed_extent_op *extent_op,
2262                                 int insert_reserved)
2263 {
2264         int ret = 0;
2265         struct btrfs_delayed_tree_ref *ref;
2266         struct btrfs_key ins;
2267         u64 parent = 0;
2268         u64 ref_root = 0;
2269         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2270                                                  SKINNY_METADATA);
2271
2272         ref = btrfs_delayed_node_to_tree_ref(node);
2273         trace_run_delayed_tree_ref(node, ref, node->action);
2274
2275         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2276                 parent = ref->parent;
2277         ref_root = ref->root;
2278
2279         ins.objectid = node->bytenr;
2280         if (skinny_metadata) {
2281                 ins.offset = ref->level;
2282                 ins.type = BTRFS_METADATA_ITEM_KEY;
2283         } else {
2284                 ins.offset = node->num_bytes;
2285                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2286         }
2287
2288         BUG_ON(node->ref_mod != 1);
2289         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2290                 BUG_ON(!extent_op || !extent_op->update_flags);
2291                 ret = alloc_reserved_tree_block(trans, root,
2292                                                 parent, ref_root,
2293                                                 extent_op->flags_to_set,
2294                                                 &extent_op->key,
2295                                                 ref->level, &ins,
2296                                                 node->no_quota);
2297         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2298                 ret = __btrfs_inc_extent_ref(trans, root, node,
2299                                              parent, ref_root,
2300                                              ref->level, 0, 1,
2301                                              extent_op);
2302         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2303                 ret = __btrfs_free_extent(trans, root, node,
2304                                           parent, ref_root,
2305                                           ref->level, 0, 1, extent_op);
2306         } else {
2307                 BUG();
2308         }
2309         return ret;
2310 }
2311
2312 /* helper function to actually process a single delayed ref entry */
2313 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2314                                struct btrfs_root *root,
2315                                struct btrfs_delayed_ref_node *node,
2316                                struct btrfs_delayed_extent_op *extent_op,
2317                                int insert_reserved)
2318 {
2319         int ret = 0;
2320
2321         if (trans->aborted) {
2322                 if (insert_reserved)
2323                         btrfs_pin_extent(root, node->bytenr,
2324                                          node->num_bytes, 1);
2325                 return 0;
2326         }
2327
2328         if (btrfs_delayed_ref_is_head(node)) {
2329                 struct btrfs_delayed_ref_head *head;
2330                 /*
2331                  * we've hit the end of the chain and we were supposed
2332                  * to insert this extent into the tree.  But, it got
2333                  * deleted before we ever needed to insert it, so all
2334                  * we have to do is clean up the accounting
2335                  */
2336                 BUG_ON(extent_op);
2337                 head = btrfs_delayed_node_to_head(node);
2338                 trace_run_delayed_ref_head(node, head, node->action);
2339
2340                 if (insert_reserved) {
2341                         btrfs_pin_extent(root, node->bytenr,
2342                                          node->num_bytes, 1);
2343                         if (head->is_data) {
2344                                 ret = btrfs_del_csums(trans, root,
2345                                                       node->bytenr,
2346                                                       node->num_bytes);
2347                         }
2348                 }
2349                 return ret;
2350         }
2351
2352         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2353             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2354                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2355                                            insert_reserved);
2356         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2357                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2358                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2359                                            insert_reserved);
2360         else
2361                 BUG();
2362         return ret;
2363 }
2364
2365 static inline struct btrfs_delayed_ref_node *
2366 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2367 {
2368         struct btrfs_delayed_ref_node *ref;
2369
2370         if (list_empty(&head->ref_list))
2371                 return NULL;
2372
2373         /*
2374          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2375          * This is to prevent a ref count from going down to zero, which deletes
2376          * the extent item from the extent tree, when there still are references
2377          * to add, which would fail because they would not find the extent item.
2378          */
2379         list_for_each_entry(ref, &head->ref_list, list) {
2380                 if (ref->action == BTRFS_ADD_DELAYED_REF)
2381                         return ref;
2382         }
2383
2384         return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2385                           list);
2386 }
2387
/*
 * Run delayed refs of the current transaction until roughly @nr entries have
 * been counted or btrfs_select_ref_head() finds no more heads.
 *
 * Each loop iteration either picks and locks a new head (when none is held)
 * or continues with the held one: a ref chosen by select_delayed_ref() is
 * unlinked and passed to run_one_delayed_ref(); once the head's list is
 * empty, a pending extent_op is run, and finally the head itself is erased
 * from href_root and run.  A nonzero return from run_delayed_extent_op() or
 * run_one_delayed_ref() is returned immediately after releasing the head.
 * This function does not call btrfs_abort_transaction() itself; the caller
 * visible in this file, btrfs_run_delayed_refs(), does so on a negative
 * return.
 */
2388 /*
2389  * Returns 0 on success or if called with an already aborted transaction.
2390  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2391  */
2392 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2393                                              struct btrfs_root *root,
2394                                              unsigned long nr)
2395 {
2396         struct btrfs_delayed_ref_root *delayed_refs;
2397         struct btrfs_delayed_ref_node *ref;
2398         struct btrfs_delayed_ref_head *locked_ref = NULL;
2399         struct btrfs_delayed_extent_op *extent_op;
2400         struct btrfs_fs_info *fs_info = root->fs_info;
2401         ktime_t start = ktime_get();
2402         int ret;
             /* count: loop iterations charged against nr (heads included) */
2403         unsigned long count = 0;
             /* actual_count: non-head refs run; gates the runtime average */
2404         unsigned long actual_count = 0;
2405         int must_insert_reserved = 0;
2406
2407         delayed_refs = &trans->transaction->delayed_refs;
2408         while (1) {
2409                 if (!locked_ref) {
                             /* the nr budget is only checked between heads */
2410                         if (count >= nr)
2411                                 break;
2412
2413                         spin_lock(&delayed_refs->lock);
2414                         locked_ref = btrfs_select_ref_head(trans);
2415                         if (!locked_ref) {
2416                                 spin_unlock(&delayed_refs->lock);
2417                                 break;
2418                         }
2419
2420                         /* grab the lock that says we are going to process
2421                          * all the refs for this head */
2422                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2423                         spin_unlock(&delayed_refs->lock);
2424                         /*
2425                          * we may have dropped the spin lock to get the head
2426                          * mutex lock, and that might have given someone else
2427                          * time to free the head.  If that's true, it has been
2428                          * removed from our list and we can move on.
2429                          */
2430                         if (ret == -EAGAIN) {
2431                                 locked_ref = NULL;
2432                                 count++;
2433                                 continue;
2434                         }
2435                 }
2436
2437                 spin_lock(&locked_ref->lock);
2438
2439                 /*
2440                  * locked_ref is the head node, so we have to go one
2441                  * node back for any delayed ref updates
2442                  */
2443                 ref = select_delayed_ref(locked_ref);
2444
                     /*
                      * If btrfs_check_delayed_seq() returns nonzero for this
                      * ref's seq, give the head back (processing cleared,
                      * num_heads_ready bumped) and go pick another head.
                      */
2445                 if (ref && ref->seq &&
2446                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2447                         spin_unlock(&locked_ref->lock);
2448                         btrfs_delayed_ref_unlock(locked_ref);
2449                         spin_lock(&delayed_refs->lock);
2450                         locked_ref->processing = 0;
2451                         delayed_refs->num_heads_ready++;
2452                         spin_unlock(&delayed_refs->lock);
2453                         locked_ref = NULL;
2454                         cond_resched();
2455                         count++;
2456                         continue;
2457                 }
2458
2459                 /*
2460                  * record the must insert reserved flag before we
2461                  * drop the spin lock.
2462                  */
2463                 must_insert_reserved = locked_ref->must_insert_reserved;
2464                 locked_ref->must_insert_reserved = 0;
2465
                     /* take the pending extent_op off the head; it is freed below */
2466                 extent_op = locked_ref->extent_op;
2467                 locked_ref->extent_op = NULL;
2468
2469                 if (!ref) {
2470
2471
2472                         /* All delayed refs have been processed, Go ahead
2473                          * and send the head node to run_one_delayed_ref,
2474                          * so that any accounting fixes can happen
2475                          */
2476                         ref = &locked_ref->node;
2477
                             /* the op is discarded when the extent was never inserted */
2478                         if (extent_op && must_insert_reserved) {
2479                                 btrfs_free_delayed_extent_op(extent_op);
2480                                 extent_op = NULL;
2481                         }
2482
2483                         if (extent_op) {
2484                                 spin_unlock(&locked_ref->lock);
2485                                 ret = run_delayed_extent_op(trans, root,
2486                                                             ref, extent_op);
2487                                 btrfs_free_delayed_extent_op(extent_op);
2488
2489                                 if (ret) {
2490                                         /*
2491                                          * Need to reset must_insert_reserved if
2492                                          * there was an error so the abort stuff
2493                                          * can cleanup the reserved space
2494                                          * properly.
2495                                          */
2496                                         if (must_insert_reserved)
2497                                                 locked_ref->must_insert_reserved = 1;
2498                                         locked_ref->processing = 0;
2499                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2500                                         btrfs_delayed_ref_unlock(locked_ref);
2501                                         return ret;
2502                                 }
                                     /* head stays locked; loop to re-examine it */
2503                                 continue;
2504                         }
2505
2506                         /*
2507                          * Need to drop our head ref lock and re-acquire the
2508                          * delayed ref lock and then re-check to make sure
2509                          * nobody got added.
2510                          */
2511                         spin_unlock(&locked_ref->lock);
2512                         spin_lock(&delayed_refs->lock);
2513                         spin_lock(&locked_ref->lock);
2514                         if (!list_empty(&locked_ref->ref_list) ||
2515                             locked_ref->extent_op) {
2516                                 spin_unlock(&locked_ref->lock);
2517                                 spin_unlock(&delayed_refs->lock);
2518                                 continue;
2519                         }
                             /* nothing new arrived: unlink the head from the rbtree */
2520                         ref->in_tree = 0;
2521                         delayed_refs->num_heads--;
2522                         rb_erase(&locked_ref->href_node,
2523                                  &delayed_refs->href_root);
2524                         spin_unlock(&delayed_refs->lock);
2525                 } else {
2526                         actual_count++;
2527                         ref->in_tree = 0;
2528                         list_del(&ref->list);
2529                 }
                     /* one entry (ref or head) has left the delayed ref structures */
2530                 atomic_dec(&delayed_refs->num_entries);
2531
2532                 if (!btrfs_delayed_ref_is_head(ref)) {
2533                         /*
2534                          * when we play the delayed ref, also correct the
2535                          * ref_mod on head
2536                          */
2537                         switch (ref->action) {
2538                         case BTRFS_ADD_DELAYED_REF:
2539                         case BTRFS_ADD_DELAYED_EXTENT:
2540                                 locked_ref->node.ref_mod -= ref->ref_mod;
2541                                 break;
2542                         case BTRFS_DROP_DELAYED_REF:
2543                                 locked_ref->node.ref_mod += ref->ref_mod;
2544                                 break;
2545                         default:
2546                                 WARN_ON(1);
2547                         }
2548                 }
2549                 spin_unlock(&locked_ref->lock);
2550
2551                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2552                                           must_insert_reserved);
2553
2554                 btrfs_free_delayed_extent_op(extent_op);
2555                 if (ret) {
2556                         locked_ref->processing = 0;
2557                         btrfs_delayed_ref_unlock(locked_ref);
2558                         btrfs_put_delayed_ref(ref);
2559                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2560                         return ret;
2561                 }
2562
2563                 /*
2564                  * If this node is a head, that means all the refs in this head
2565                  * have been dealt with, and we will pick the next head to deal
2566                  * with, so we must unlock the head and drop it from the cluster
2567                  * list before we release it.
2568                  */
2569                 if (btrfs_delayed_ref_is_head(ref)) {
2570                         if (locked_ref->is_data &&
2571                             locked_ref->total_ref_mod < 0) {
2572                                 spin_lock(&delayed_refs->lock);
2573                                 delayed_refs->pending_csums -= ref->num_bytes;
2574                                 spin_unlock(&delayed_refs->lock);
2575                         }
2576                         btrfs_delayed_ref_unlock(locked_ref);
2577                         locked_ref = NULL;
2578                 }
2579                 btrfs_put_delayed_ref(ref);
2580                 count++;
2581                 cond_resched();
2582         }
2583
2584         /*
2585          * We don't want to include ref heads since we can have empty ref heads
2586          * and those will drastically skew our runtime down since we just do
2587          * accounting, no actual extent tree updates.
2588          */
2589         if (actual_count > 0) {
2590                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2591                 u64 avg;
2592
2593                 /*
2594                  * We weigh the current average higher than our current runtime
2595                  * to avoid large swings in the average.
2596                  */
2597                 spin_lock(&delayed_refs->lock);
2598                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2599                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2600                 spin_unlock(&delayed_refs->lock);
2601         }
2602         return 0;
2603 }
2604
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 *
 * Walks from the root of @root alternating left and right children and
 * returns the bytenr of the last node visited.  Returns 0 for an empty tree
 * (the result was previously left uninitialized in that case).
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle = 0;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		/* zig-zag: left on odd steps, right on even ones */
		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif
2647
2648 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2649 {
2650         u64 num_bytes;
2651
2652         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2653                              sizeof(struct btrfs_extent_inline_ref));
2654         if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2655                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2656
2657         /*
2658          * We don't ever fill up leaves all the way so multiply by 2 just to be
2659          * closer to what we're really going to want to ouse.
2660          */
2661         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2662 }
2663
2664 /*
2665  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2666  * would require to store the csums for that many bytes.
2667  */
2668 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2669 {
2670         u64 csum_size;
2671         u64 num_csums_per_leaf;
2672         u64 num_csums;
2673
2674         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
2675         num_csums_per_leaf = div64_u64(csum_size,
2676                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2677         num_csums = div64_u64(csum_bytes, root->sectorsize);
2678         num_csums += num_csums_per_leaf - 1;
2679         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2680         return num_csums;
2681 }
2682
2683 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2684                                        struct btrfs_root *root)
2685 {
2686         struct btrfs_block_rsv *global_rsv;
2687         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2688         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2689         u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2690         u64 num_bytes, num_dirty_bgs_bytes;
2691         int ret = 0;
2692
2693         num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2694         num_heads = heads_to_leaves(root, num_heads);
2695         if (num_heads > 1)
2696                 num_bytes += (num_heads - 1) * root->nodesize;
2697         num_bytes <<= 1;
2698         num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2699         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2700                                                              num_dirty_bgs);
2701         global_rsv = &root->fs_info->global_block_rsv;
2702
2703         /*
2704          * If we can't allocate any more chunks lets make sure we have _lots_ of
2705          * wiggle room since running delayed refs can create more delayed refs.
2706          */
2707         if (global_rsv->space_info->full) {
2708                 num_dirty_bgs_bytes <<= 1;
2709                 num_bytes <<= 1;
2710         }
2711
2712         spin_lock(&global_rsv->lock);
2713         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2714                 ret = 1;
2715         spin_unlock(&global_rsv->lock);
2716         return ret;
2717 }
2718
2719 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2720                                        struct btrfs_root *root)
2721 {
2722         struct btrfs_fs_info *fs_info = root->fs_info;
2723         u64 num_entries =
2724                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2725         u64 avg_runtime;
2726         u64 val;
2727
2728         smp_mb();
2729         avg_runtime = fs_info->avg_delayed_ref_runtime;
2730         val = num_entries * avg_runtime;
2731         if (num_entries * avg_runtime >= NSEC_PER_SEC)
2732                 return 1;
2733         if (val >= NSEC_PER_SEC / 2)
2734                 return 2;
2735
2736         return btrfs_check_space_for_delayed_refs(trans, root);
2737 }
2738
/*
 * One request to run delayed refs from a work item queued with
 * btrfs_queue_work().  Filled in by btrfs_async_run_delayed_refs() and
 * consumed by delayed_ref_async_start().
 */
2739 struct async_delayed_refs {
2740         struct btrfs_root *root;        /* root the work function joins a transaction on */
2741         int count;                      /* passed to btrfs_run_delayed_refs() */
2742         int error;                      /* first error recorded by the work function, 0 if none */
2743         int sync;                       /* nonzero: submitter waits and frees; zero: work function frees */
2744         struct completion wait;         /* completed by the work function when sync is set */
2745         struct btrfs_work work;         /* embedded work item; recovered with container_of() */
2746 };
2747
2748 static void delayed_ref_async_start(struct btrfs_work *work)
2749 {
2750         struct async_delayed_refs *async;
2751         struct btrfs_trans_handle *trans;
2752         int ret;
2753
2754         async = container_of(work, struct async_delayed_refs, work);
2755
2756         trans = btrfs_join_transaction(async->root);
2757         if (IS_ERR(trans)) {
2758                 async->error = PTR_ERR(trans);
2759                 goto done;
2760         }
2761
2762         /*
2763          * trans->sync means that when we call end_transaciton, we won't
2764          * wait on delayed refs
2765          */
2766         trans->sync = true;
2767         ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2768         if (ret)
2769                 async->error = ret;
2770
2771         ret = btrfs_end_transaction(trans, async->root);
2772         if (ret && !async->error)
2773                 async->error = ret;
2774 done:
2775         if (async->sync)
2776                 complete(&async->wait);
2777         else
2778                 kfree(async);
2779 }
2780
2781 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2782                                  unsigned long count, int wait)
2783 {
2784         struct async_delayed_refs *async;
2785         int ret;
2786
2787         async = kmalloc(sizeof(*async), GFP_NOFS);
2788         if (!async)
2789                 return -ENOMEM;
2790
2791         async->root = root->fs_info->tree_root;
2792         async->count = count;
2793         async->error = 0;
2794         if (wait)
2795                 async->sync = 1;
2796         else
2797                 async->sync = 0;
2798         init_completion(&async->wait);
2799
2800         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2801                         delayed_ref_async_start, NULL, NULL);
2802
2803         btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2804
2805         if (wait) {
2806                 wait_for_completion(&async->wait);
2807                 ret = async->error;
2808                 kfree(async);
2809                 return ret;
2810         }
2811         return 0;
2812 }
2813
2814 /*
2815  * this starts processing the delayed reference count updates and
2816  * extent insertions we have queued up so far.  count can be
2817  * 0, which means to process everything in the tree at the start
2818  * of the run (but not newly added entries), or it can be some target
2819  * number you'd like to process.
2820  *
2821  * Returns 0 on success or if called with an aborted transaction
2822  * Returns <0 on error and aborts the transaction
2823  */
2824 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2825                            struct btrfs_root *root, unsigned long count)
2826 {
2827         struct rb_node *node;
2828         struct btrfs_delayed_ref_root *delayed_refs;
2829         struct btrfs_delayed_ref_head *head;
2830         int ret;
2831         int run_all = count == (unsigned long)-1;
2832
2833         /* We'll clean this up in btrfs_cleanup_transaction */
2834         if (trans->aborted)
2835                 return 0;
2836
2837         if (root == root->fs_info->extent_root)
2838                 root = root->fs_info->tree_root;
2839
2840         delayed_refs = &trans->transaction->delayed_refs;
2841         if (count == 0)
2842                 count = atomic_read(&delayed_refs->num_entries) * 2;
2843
2844 again:
2845 #ifdef SCRAMBLE_DELAYED_REFS
2846         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2847 #endif
2848         ret = __btrfs_run_delayed_refs(trans, root, count);
2849         if (ret < 0) {
2850                 btrfs_abort_transaction(trans, root, ret);
2851                 return ret;
2852         }
2853
2854         if (run_all) {
2855                 if (!list_empty(&trans->new_bgs))
2856                         btrfs_create_pending_block_groups(trans, root);
2857
2858                 spin_lock(&delayed_refs->lock);
2859                 node = rb_first(&delayed_refs->href_root);
2860                 if (!node) {
2861                         spin_unlock(&delayed_refs->lock);
2862                         goto out;
2863                 }
2864                 count = (unsigned long)-1;
2865
2866                 while (node) {
2867                         head = rb_entry(node, struct btrfs_delayed_ref_head,
2868                                         href_node);
2869                         if (btrfs_delayed_ref_is_head(&head->node)) {
2870                                 struct btrfs_delayed_ref_node *ref;
2871
2872                                 ref = &head->node;
2873                                 atomic_inc(&ref->refs);
2874
2875                                 spin_unlock(&delayed_refs->lock);
2876                                 /*
2877                                  * Mutex was contended, block until it's
2878                                  * released and try again
2879                                  */
2880                                 mutex_lock(&head->mutex);
2881                                 mutex_unlock(&head->mutex);
2882
2883                                 btrfs_put_delayed_ref(ref);
2884                                 cond_resched();
2885                                 goto again;
2886                         } else {
2887                                 WARN_ON(1);
2888                         }
2889                         node = rb_next(node);
2890                 }
2891                 spin_unlock(&delayed_refs->lock);
2892                 cond_resched();
2893                 goto again;
2894         }
2895 out:
2896         assert_qgroups_uptodate(trans);
2897         return 0;
2898 }
2899
2900 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2901                                 struct btrfs_root *root,
2902                                 u64 bytenr, u64 num_bytes, u64 flags,
2903                                 int level, int is_data)
2904 {
2905         struct btrfs_delayed_extent_op *extent_op;
2906         int ret;
2907
2908         extent_op = btrfs_alloc_delayed_extent_op();
2909         if (!extent_op)
2910                 return -ENOMEM;
2911
2912         extent_op->flags_to_set = flags;
2913         extent_op->update_flags = 1;
2914         extent_op->update_key = 0;
2915         extent_op->is_data = is_data ? 1 : 0;
2916         extent_op->level = level;
2917
2918         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2919                                           num_bytes, extent_op);
2920         if (ret)
2921                 btrfs_free_delayed_extent_op(extent_op);
2922         return ret;
2923 }
2924
2925 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2926                                       struct btrfs_root *root,
2927                                       struct btrfs_path *path,
2928                                       u64 objectid, u64 offset, u64 bytenr)
2929 {
2930         struct btrfs_delayed_ref_head *head;
2931         struct btrfs_delayed_ref_node *ref;
2932         struct btrfs_delayed_data_ref *data_ref;
2933         struct btrfs_delayed_ref_root *delayed_refs;
2934         int ret = 0;
2935
2936         delayed_refs = &trans->transaction->delayed_refs;
2937         spin_lock(&delayed_refs->lock);
2938         head = btrfs_find_delayed_ref_head(trans, bytenr);
2939         if (!head) {
2940                 spin_unlock(&delayed_refs->lock);
2941                 return 0;
2942         }
2943
2944         if (!mutex_trylock(&head->mutex)) {
2945                 atomic_inc(&head->node.refs);
2946                 spin_unlock(&delayed_refs->lock);
2947
2948                 btrfs_release_path(path);
2949
2950                 /*
2951                  * Mutex was contended, block until it's released and let
2952                  * caller try again
2953                  */
2954                 mutex_lock(&head->mutex);
2955                 mutex_unlock(&head->mutex);
2956                 btrfs_put_delayed_ref(&head->node);
2957                 return -EAGAIN;
2958         }
2959         spin_unlock(&delayed_refs->lock);
2960
2961         spin_lock(&head->lock);
2962         list_for_each_entry(ref, &head->ref_list, list) {
2963                 /* If it's a shared ref we know a cross reference exists */
2964                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2965                         ret = 1;
2966                         break;
2967                 }
2968
2969                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2970
2971                 /*
2972                  * If our ref doesn't match the one we're currently looking at
2973                  * then we have a cross reference.
2974                  */
2975                 if (data_ref->root != root->root_key.objectid ||
2976                     data_ref->objectid != objectid ||
2977                     data_ref->offset != offset) {
2978                         ret = 1;
2979                         break;
2980                 }
2981         }
2982         spin_unlock(&head->lock);
2983         mutex_unlock(&head->mutex);
2984         return ret;
2985 }
2986
2987 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2988                                         struct btrfs_root *root,
2989                                         struct btrfs_path *path,
2990                                         u64 objectid, u64 offset, u64 bytenr)
2991 {
2992         struct btrfs_root *extent_root = root->fs_info->extent_root;
2993         struct extent_buffer *leaf;
2994         struct btrfs_extent_data_ref *ref;
2995         struct btrfs_extent_inline_ref *iref;
2996         struct btrfs_extent_item *ei;
2997         struct btrfs_key key;
2998         u32 item_size;
2999         int ret;
3000
3001         key.objectid = bytenr;
3002         key.offset = (u64)-1;
3003         key.type = BTRFS_EXTENT_ITEM_KEY;
3004
3005         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3006         if (ret < 0)
3007                 goto out;
3008         BUG_ON(ret == 0); /* Corruption */
3009
3010         ret = -ENOENT;
3011         if (path->slots[0] == 0)
3012                 goto out;
3013
3014         path->slots[0]--;
3015         leaf = path->nodes[0];
3016         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3017
3018         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3019                 goto out;
3020
3021         ret = 1;
3022         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3023 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3024         if (item_size < sizeof(*ei)) {
3025                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3026                 goto out;
3027         }
3028 #endif
3029         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3030
3031         if (item_size != sizeof(*ei) +
3032             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3033                 goto out;
3034
3035         if (btrfs_extent_generation(leaf, ei) <=
3036             btrfs_root_last_snapshot(&root->root_item))
3037                 goto out;
3038
3039         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3040         if (btrfs_extent_inline_ref_type(leaf, iref) !=
3041             BTRFS_EXTENT_DATA_REF_KEY)
3042                 goto out;
3043
3044         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3045         if (btrfs_extent_refs(leaf, ei) !=
3046             btrfs_extent_data_ref_count(leaf, ref) ||
3047             btrfs_extent_data_ref_root(leaf, ref) !=
3048             root->root_key.objectid ||
3049             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3050             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3051                 goto out;
3052
3053         ret = 0;
3054 out:
3055         return ret;
3056 }
3057
3058 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3059                           struct btrfs_root *root,
3060                           u64 objectid, u64 offset, u64 bytenr)
3061 {
3062         struct btrfs_path *path;
3063         int ret;
3064         int ret2;
3065
3066         path = btrfs_alloc_path();
3067         if (!path)
3068                 return -ENOENT;
3069
3070         do {
3071                 ret = check_committed_ref(trans, root, path, objectid,
3072                                           offset, bytenr);
3073                 if (ret && ret != -ENOENT)
3074                         goto out;
3075
3076                 ret2 = check_delayed_ref(trans, root, path, objectid,
3077                                          offset, bytenr);
3078         } while (ret2 == -EAGAIN);
3079
3080         if (ret2 && ret2 != -ENOENT) {
3081                 ret = ret2;
3082                 goto out;
3083         }
3084
3085         if (ret != -ENOENT || ret2 != -ENOENT)
3086                 ret = 0;
3087 out:
3088         btrfs_free_path(path);
3089         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3090                 WARN_ON(ret > 0);
3091         return ret;
3092 }
3093
3094 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3095                            struct btrfs_root *root,
3096                            struct extent_buffer *buf,
3097                            int full_backref, int inc)
3098 {
3099         u64 bytenr;
3100         u64 num_bytes;
3101         u64 parent;
3102         u64 ref_root;
3103         u32 nritems;
3104         struct btrfs_key key;
3105         struct btrfs_file_extent_item *fi;
3106         int i;
3107         int level;
3108         int ret = 0;
3109         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3110                             u64, u64, u64, u64, u64, u64, int);
3111
3112
3113         if (btrfs_test_is_dummy_root(root))
3114                 return 0;
3115
3116         ref_root = btrfs_header_owner(buf);
3117         nritems = btrfs_header_nritems(buf);
3118         level = btrfs_header_level(buf);
3119
3120         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3121                 return 0;
3122
3123         if (inc)
3124                 process_func = btrfs_inc_extent_ref;
3125         else
3126                 process_func = btrfs_free_extent;
3127
3128         if (full_backref)
3129                 parent = buf->start;
3130         else
3131                 parent = 0;
3132
3133         for (i = 0; i < nritems; i++) {
3134                 if (level == 0) {
3135                         btrfs_item_key_to_cpu(buf, &key, i);
3136                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3137                                 continue;
3138                         fi = btrfs_item_ptr(buf, i,
3139                                             struct btrfs_file_extent_item);
3140                         if (btrfs_file_extent_type(buf, fi) ==
3141                             BTRFS_FILE_EXTENT_INLINE)
3142                                 continue;
3143                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3144                         if (bytenr == 0)
3145                                 continue;
3146
3147                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3148                         key.offset -= btrfs_file_extent_offset(buf, fi);
3149                         ret = process_func(trans, root, bytenr, num_bytes,
3150                                            parent, ref_root, key.objectid,
3151                                            key.offset, 1);
3152                         if (ret)
3153                                 goto fail;
3154                 } else {
3155                         bytenr = btrfs_node_blockptr(buf, i);
3156                         num_bytes = root->nodesize;
3157                         ret = process_func(trans, root, bytenr, num_bytes,
3158                                            parent, ref_root, level - 1, 0,
3159                                            1);
3160                         if (ret)
3161                                 goto fail;
3162                 }
3163         }
3164         return 0;
3165 fail:
3166         return ret;
3167 }
3168
/*
 * Add one reference to everything @buf points to; see __btrfs_mod_ref().
 */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 1;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3174
/*
 * Drop one reference from everything @buf points to; see __btrfs_mod_ref().
 */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 0;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3180
3181 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3182                                  struct btrfs_root *root,
3183                                  struct btrfs_path *path,
3184                                  struct btrfs_block_group_cache *cache)
3185 {
3186         int ret;
3187         struct btrfs_root *extent_root = root->fs_info->extent_root;
3188         unsigned long bi;
3189         struct extent_buffer *leaf;
3190
3191         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3192         if (ret) {
3193                 if (ret > 0)
3194                         ret = -ENOENT;
3195                 goto fail;
3196         }
3197
3198         leaf = path->nodes[0];
3199         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3200         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3201         btrfs_mark_buffer_dirty(leaf);
3202 fail:
3203         btrfs_release_path(path);
3204         return ret;
3205
3206 }
3207
3208 static struct btrfs_block_group_cache *
3209 next_block_group(struct btrfs_root *root,
3210                  struct btrfs_block_group_cache *cache)
3211 {
3212         struct rb_node *node;
3213
3214         spin_lock(&root->fs_info->block_group_cache_lock);
3215
3216         /* If our block group was removed, we need a full search. */
3217         if (RB_EMPTY_NODE(&cache->cache_node)) {
3218                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3219
3220                 spin_unlock(&root->fs_info->block_group_cache_lock);
3221                 btrfs_put_block_group(cache);
3222                 cache = btrfs_lookup_first_block_group(root->fs_info,
3223                                                        next_bytenr);
3224                 return cache;
3225         }
3226         node = rb_next(&cache->cache_node);
3227         btrfs_put_block_group(cache);
3228         if (node) {
3229                 cache = rb_entry(node, struct btrfs_block_group_cache,
3230                                  cache_node);
3231                 btrfs_get_block_group(cache);
3232         } else
3233                 cache = NULL;
3234         spin_unlock(&root->fs_info->block_group_cache_lock);
3235         return cache;
3236 }
3237
3238 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3239                             struct btrfs_trans_handle *trans,
3240                             struct btrfs_path *path)
3241 {
3242         struct btrfs_root *root = block_group->fs_info->tree_root;
3243         struct inode *inode = NULL;
3244         u64 alloc_hint = 0;
3245         int dcs = BTRFS_DC_ERROR;
3246         u64 num_pages = 0;
3247         int retries = 0;
3248         int ret = 0;
3249
3250         /*
3251          * If this block group is smaller than 100 megs don't bother caching the
3252          * block group.
3253          */
3254         if (block_group->key.offset < (100 * 1024 * 1024)) {
3255                 spin_lock(&block_group->lock);
3256                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3257                 spin_unlock(&block_group->lock);
3258                 return 0;
3259         }
3260
3261         if (trans->aborted)
3262                 return 0;
3263 again:
3264         inode = lookup_free_space_inode(root, block_group, path);
3265         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3266                 ret = PTR_ERR(inode);
3267                 btrfs_release_path(path);
3268                 goto out;
3269         }
3270
3271         if (IS_ERR(inode)) {
3272                 BUG_ON(retries);
3273                 retries++;
3274
3275                 if (block_group->ro)
3276                         goto out_free;
3277
3278                 ret = create_free_space_inode(root, trans, block_group, path);
3279                 if (ret)
3280                         goto out_free;
3281                 goto again;
3282         }
3283
3284         /* We've already setup this transaction, go ahead and exit */
3285         if (block_group->cache_generation == trans->transid &&
3286             i_size_read(inode)) {
3287                 dcs = BTRFS_DC_SETUP;
3288                 goto out_put;
3289         }
3290
3291         /*
3292          * We want to set the generation to 0, that way if anything goes wrong
3293          * from here on out we know not to trust this cache when we load up next
3294          * time.
3295          */
3296         BTRFS_I(inode)->generation = 0;
3297         ret = btrfs_update_inode(trans, root, inode);
3298         if (ret) {
3299                 /*
3300                  * So theoretically we could recover from this, simply set the
3301                  * super cache generation to 0 so we know to invalidate the
3302                  * cache, but then we'd have to keep track of the block groups
3303                  * that fail this way so we know we _have_ to reset this cache
3304                  * before the next commit or risk reading stale cache.  So to
3305                  * limit our exposure to horrible edge cases lets just abort the
3306                  * transaction, this only happens in really bad situations
3307                  * anyway.
3308                  */
3309                 btrfs_abort_transaction(trans, root, ret);
3310                 goto out_put;
3311         }
3312         WARN_ON(ret);
3313
3314         if (i_size_read(inode) > 0) {
3315                 ret = btrfs_check_trunc_cache_free_space(root,
3316                                         &root->fs_info->global_block_rsv);
3317                 if (ret)
3318                         goto out_put;
3319
3320                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3321                 if (ret)
3322                         goto out_put;
3323         }
3324
3325         spin_lock(&block_group->lock);
3326         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3327             !btrfs_test_opt(root, SPACE_CACHE)) {
3328                 /*
3329                  * don't bother trying to write stuff out _if_
3330                  * a) we're not cached,
3331                  * b) we're with nospace_cache mount option.
3332                  */
3333                 dcs = BTRFS_DC_WRITTEN;
3334                 spin_unlock(&block_group->lock);
3335                 goto out_put;
3336         }
3337         spin_unlock(&block_group->lock);
3338
3339         /*
3340          * Try to preallocate enough space based on how big the block group is.
3341          * Keep in mind this has to include any pinned space which could end up
3342          * taking up quite a bit since it's not folded into the other space
3343          * cache.
3344          */
3345         num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
3346         if (!num_pages)
3347                 num_pages = 1;
3348
3349         num_pages *= 16;
3350         num_pages *= PAGE_CACHE_SIZE;
3351
3352         ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
3353         if (ret)
3354                 goto out_put;
3355
3356         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3357                                               num_pages, num_pages,
3358                                               &alloc_hint);
3359         if (!ret)
3360                 dcs = BTRFS_DC_SETUP;
3361         btrfs_free_reserved_data_space(inode, num_pages);
3362
3363 out_put:
3364         iput(inode);
3365 out_free:
3366         btrfs_release_path(path);
3367 out:
3368         spin_lock(&block_group->lock);
3369         if (!ret && dcs == BTRFS_DC_SETUP)
3370                 block_group->cache_generation = trans->transid;
3371         block_group->disk_cache_state = dcs;
3372         spin_unlock(&block_group->lock);
3373
3374         return ret;
3375 }
3376
3377 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3378                             struct btrfs_root *root)
3379 {
3380         struct btrfs_block_group_cache *cache, *tmp;
3381         struct btrfs_transaction *cur_trans = trans->transaction;
3382         struct btrfs_path *path;
3383
3384         if (list_empty(&cur_trans->dirty_bgs) ||
3385             !btrfs_test_opt(root, SPACE_CACHE))
3386                 return 0;
3387
3388         path = btrfs_alloc_path();
3389         if (!path)
3390                 return -ENOMEM;
3391
3392         /* Could add new block groups, use _safe just in case */
3393         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3394                                  dirty_list) {
3395                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3396                         cache_save_setup(cache, trans, path);
3397         }
3398
3399         btrfs_free_path(path);
3400         return 0;
3401 }
3402
3403 /*
3404  * transaction commit does final block group cache writeback during a
3405  * critical section where nothing is allowed to change the FS.  This is
3406  * required in order for the cache to actually match the block group,
3407  * but can introduce a lot of latency into the commit.
3408  *
3409  * So, btrfs_start_dirty_block_groups is here to kick off block group
3410  * cache IO.  There's a chance we'll have to redo some of it if the
3411  * block group changes again during the commit, but it greatly reduces
3412  * the commit latency by getting rid of the easy block groups while
3413  * we're still allowing others to join the commit.
3414  */
3415 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3416                                    struct btrfs_root *root)
3417 {
3418         struct btrfs_block_group_cache *cache;
3419         struct btrfs_transaction *cur_trans = trans->transaction;
3420         int ret = 0;
3421         int should_put;
3422         struct btrfs_path *path = NULL;
3423         LIST_HEAD(dirty);
3424         struct list_head *io = &cur_trans->io_bgs;
3425         int num_started = 0;
3426         int loops = 0;
3427
3428         spin_lock(&cur_trans->dirty_bgs_lock);
3429         if (list_empty(&cur_trans->dirty_bgs)) {
3430                 spin_unlock(&cur_trans->dirty_bgs_lock);
3431                 return 0;
3432         }
3433         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3434         spin_unlock(&cur_trans->dirty_bgs_lock);
3435
3436 again:
3437         /*
3438          * make sure all the block groups on our dirty list actually
3439          * exist
3440          */
3441         btrfs_create_pending_block_groups(trans, root);
3442
3443         if (!path) {
3444                 path = btrfs_alloc_path();
3445                 if (!path)
3446                         return -ENOMEM;
3447         }
3448
3449         /*
3450          * cache_write_mutex is here only to save us from balance or automatic
3451          * removal of empty block groups deleting this block group while we are
3452          * writing out the cache
3453          */
3454         mutex_lock(&trans->transaction->cache_write_mutex);
3455         while (!list_empty(&dirty)) {
3456                 cache = list_first_entry(&dirty,
3457                                          struct btrfs_block_group_cache,
3458                                          dirty_list);
3459                 /*
3460                  * this can happen if something re-dirties a block
3461                  * group that is already under IO.  Just wait for it to
3462                  * finish and then do it all again
3463                  */
3464                 if (!list_empty(&cache->io_list)) {
3465                         list_del_init(&cache->io_list);
3466                         btrfs_wait_cache_io(root, trans, cache,
3467                                             &cache->io_ctl, path,
3468                                             cache->key.objectid);
3469                         btrfs_put_block_group(cache);
3470                 }
3471
3472
3473                 /*
3474                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3475                  * if it should update the cache_state.  Don't delete
3476                  * until after we wait.
3477                  *
3478                  * Since we're not running in the commit critical section
3479                  * we need the dirty_bgs_lock to protect from update_block_group
3480                  */
3481                 spin_lock(&cur_trans->dirty_bgs_lock);
3482                 list_del_init(&cache->dirty_list);
3483                 spin_unlock(&cur_trans->dirty_bgs_lock);
3484
3485                 should_put = 1;
3486
3487                 cache_save_setup(cache, trans, path);
3488
3489                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3490                         cache->io_ctl.inode = NULL;
3491                         ret = btrfs_write_out_cache(root, trans, cache, path);
3492                         if (ret == 0 && cache->io_ctl.inode) {
3493                                 num_started++;
3494                                 should_put = 0;
3495
3496                                 /*
3497                                  * the cache_write_mutex is protecting
3498                                  * the io_list
3499                                  */
3500                                 list_add_tail(&cache->io_list, io);
3501                         } else {
3502                                 /*
3503                                  * if we failed to write the cache, the
3504                                  * generation will be bad and life goes on
3505                                  */
3506                                 ret = 0;
3507                         }
3508                 }
3509                 if (!ret) {
3510                         ret = write_one_cache_group(trans, root, path, cache);
3511                         /*
3512                          * Our block group might still be attached to the list
3513                          * of new block groups in the transaction handle of some
3514                          * other task (struct btrfs_trans_handle->new_bgs). This
3515                          * means its block group item isn't yet in the extent
3516                          * tree. If this happens ignore the error, as we will
3517                          * try again later in the critical section of the
3518                          * transaction commit.
3519                          */
3520                         if (ret == -ENOENT) {
3521                                 ret = 0;
3522                                 spin_lock(&cur_trans->dirty_bgs_lock);
3523                                 if (list_empty(&cache->dirty_list)) {
3524                                         list_add_tail(&cache->dirty_list,
3525                                                       &cur_trans->dirty_bgs);
3526                                         btrfs_get_block_group(cache);
3527                                 }
3528                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3529                         } else if (ret) {
3530                                 btrfs_abort_transaction(trans, root, ret);
3531                         }
3532                 }
3533
3534                 /* if its not on the io list, we need to put the block group */
3535                 if (should_put)
3536                         btrfs_put_block_group(cache);
3537
3538                 if (ret)
3539                         break;
3540
3541                 /*
3542                  * Avoid blocking other tasks for too long. It might even save
3543                  * us from writing caches for block groups that are going to be
3544                  * removed.
3545                  */
3546                 mutex_unlock(&trans->transaction->cache_write_mutex);
3547                 mutex_lock(&trans->transaction->cache_write_mutex);
3548         }
3549         mutex_unlock(&trans->transaction->cache_write_mutex);
3550
3551         /*
3552          * go through delayed refs for all the stuff we've just kicked off
3553          * and then loop back (just once)
3554          */
3555         ret = btrfs_run_delayed_refs(trans, root, 0);
3556         if (!ret && loops == 0) {
3557                 loops++;
3558                 spin_lock(&cur_trans->dirty_bgs_lock);
3559                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3560                 /*
3561                  * dirty_bgs_lock protects us from concurrent block group
3562                  * deletes too (not just cache_write_mutex).
3563                  */
3564                 if (!list_empty(&dirty)) {
3565                         spin_unlock(&cur_trans->dirty_bgs_lock);
3566                         goto again;
3567                 }
3568                 spin_unlock(&cur_trans->dirty_bgs_lock);
3569         }
3570
3571         btrfs_free_path(path);
3572         return ret;
3573 }
3574
3575 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3576                                    struct btrfs_root *root)
3577 {
3578         struct btrfs_block_group_cache *cache;
3579         struct btrfs_transaction *cur_trans = trans->transaction;
3580         int ret = 0;
3581         int should_put;
3582         struct btrfs_path *path;
3583         struct list_head *io = &cur_trans->io_bgs;
3584         int num_started = 0;
3585
3586         path = btrfs_alloc_path();
3587         if (!path)
3588                 return -ENOMEM;
3589
3590         /*
3591          * We don't need the lock here since we are protected by the transaction
3592          * commit.  We want to do the cache_save_setup first and then run the
3593          * delayed refs to make sure we have the best chance at doing this all
3594          * in one shot.
3595          */
3596         while (!list_empty(&cur_trans->dirty_bgs)) {
3597                 cache = list_first_entry(&cur_trans->dirty_bgs,
3598                                          struct btrfs_block_group_cache,
3599                                          dirty_list);
3600
3601                 /*
3602                  * this can happen if cache_save_setup re-dirties a block
3603                  * group that is already under IO.  Just wait for it to
3604                  * finish and then do it all again
3605                  */
3606                 if (!list_empty(&cache->io_list)) {
3607                         list_del_init(&cache->io_list);
3608                         btrfs_wait_cache_io(root, trans, cache,
3609                                             &cache->io_ctl, path,
3610                                             cache->key.objectid);
3611                         btrfs_put_block_group(cache);
3612                 }
3613
3614                 /*
3615                  * don't remove from the dirty list until after we've waited
3616                  * on any pending IO
3617                  */
3618                 list_del_init(&cache->dirty_list);
3619                 should_put = 1;
3620
3621                 cache_save_setup(cache, trans, path);
3622
3623                 if (!ret)
3624                         ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3625
3626                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3627                         cache->io_ctl.inode = NULL;
3628                         ret = btrfs_write_out_cache(root, trans, cache, path);
3629                         if (ret == 0 && cache->io_ctl.inode) {
3630                                 num_started++;
3631                                 should_put = 0;
3632                                 list_add_tail(&cache->io_list, io);
3633                         } else {
3634                                 /*
3635                                  * if we failed to write the cache, the
3636                                  * generation will be bad and life goes on
3637                                  */
3638                                 ret = 0;
3639                         }
3640                 }
3641                 if (!ret) {
3642                         ret = write_one_cache_group(trans, root, path, cache);
3643                         if (ret)
3644                                 btrfs_abort_transaction(trans, root, ret);
3645                 }
3646
3647                 /* if its not on the io list, we need to put the block group */
3648                 if (should_put)
3649                         btrfs_put_block_group(cache);
3650         }
3651
3652         while (!list_empty(io)) {
3653                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3654                                          io_list);
3655                 list_del_init(&cache->io_list);
3656                 btrfs_wait_cache_io(root, trans, cache,
3657                                     &cache->io_ctl, path, cache->key.objectid);
3658                 btrfs_put_block_group(cache);
3659         }
3660
3661         btrfs_free_path(path);
3662         return ret;
3663 }
3664
3665 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3666 {
3667         struct btrfs_block_group_cache *block_group;
3668         int readonly = 0;
3669
3670         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3671         if (!block_group || block_group->ro)
3672                 readonly = 1;
3673         if (block_group)
3674                 btrfs_put_block_group(block_group);
3675         return readonly;
3676 }
3677
3678 static const char *alloc_name(u64 flags)
3679 {
3680         switch (flags) {
3681         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3682                 return "mixed";
3683         case BTRFS_BLOCK_GROUP_METADATA:
3684                 return "metadata";
3685         case BTRFS_BLOCK_GROUP_DATA:
3686                 return "data";
3687         case BTRFS_BLOCK_GROUP_SYSTEM:
3688                 return "system";
3689         default:
3690                 WARN_ON(1);
3691                 return "invalid-combination";
3692         };
3693 }
3694
3695 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3696                              u64 total_bytes, u64 bytes_used,
3697                              struct btrfs_space_info **space_info)
3698 {
3699         struct btrfs_space_info *found;
3700         int i;
3701         int factor;
3702         int ret;
3703
3704         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3705                      BTRFS_BLOCK_GROUP_RAID10))
3706                 factor = 2;
3707         else
3708                 factor = 1;
3709
3710         found = __find_space_info(info, flags);
3711         if (found) {
3712                 spin_lock(&found->lock);
3713                 found->total_bytes += total_bytes;
3714                 found->disk_total += total_bytes * factor;
3715                 found->bytes_used += bytes_used;
3716                 found->disk_used += bytes_used * factor;
3717                 if (total_bytes > 0)
3718                         found->full = 0;
3719                 spin_unlock(&found->lock);
3720                 *space_info = found;
3721                 return 0;
3722         }
3723         found = kzalloc(sizeof(*found), GFP_NOFS);
3724         if (!found)
3725                 return -ENOMEM;
3726
3727         ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3728         if (ret) {
3729                 kfree(found);
3730                 return ret;
3731         }
3732
3733         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3734                 INIT_LIST_HEAD(&found->block_groups[i]);
3735         init_rwsem(&found->groups_sem);
3736         spin_lock_init(&found->lock);
3737         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3738         found->total_bytes = total_bytes;
3739         found->disk_total = total_bytes * factor;
3740         found->bytes_used = bytes_used;
3741         found->disk_used = bytes_used * factor;
3742         found->bytes_pinned = 0;
3743         found->bytes_reserved = 0;
3744         found->bytes_readonly = 0;
3745         found->bytes_may_use = 0;
3746         if (total_bytes > 0)
3747                 found->full = 0;
3748         else
3749                 found->full = 1;
3750         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3751         found->chunk_alloc = 0;
3752         found->flush = 0;
3753         init_waitqueue_head(&found->wait);
3754         INIT_LIST_HEAD(&found->ro_bgs);
3755
3756         ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3757                                     info->space_info_kobj, "%s",
3758                                     alloc_name(found->flags));
3759         if (ret) {
3760                 kfree(found);
3761                 return ret;
3762         }
3763
3764         *space_info = found;
3765         list_add_rcu(&found->list, &info->space_info);
3766         if (flags & BTRFS_BLOCK_GROUP_DATA)
3767                 info->data_sinfo = found;
3768
3769         return ret;
3770 }
3771
3772 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3773 {
3774         u64 extra_flags = chunk_to_extended(flags) &
3775                                 BTRFS_EXTENDED_PROFILE_MASK;
3776
3777         write_seqlock(&fs_info->profiles_lock);
3778         if (flags & BTRFS_BLOCK_GROUP_DATA)
3779                 fs_info->avail_data_alloc_bits |= extra_flags;
3780         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3781                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3782         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3783                 fs_info->avail_system_alloc_bits |= extra_flags;
3784         write_sequnlock(&fs_info->profiles_lock);
3785 }
3786
3787 /*
3788  * returns target flags in extended format or 0 if restripe for this
3789  * chunk_type is not in progress
3790  *
3791  * should be called with either volume_mutex or balance_lock held
3792  */
3793 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3794 {
3795         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3796         u64 target = 0;
3797
3798         if (!bctl)
3799                 return 0;
3800
3801         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3802             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3803                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3804         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3805                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3806                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3807         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3808                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3809                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3810         }
3811
3812         return target;
3813 }
3814
3815 /*
3816  * @flags: available profiles in extended format (see ctree.h)
3817  *
3818  * Returns reduced profile in chunk format.  If profile changing is in
3819  * progress (either running or paused) picks the target profile (if it's
3820  * already available), otherwise falls back to plain reducing.
3821  */
3822 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3823 {
3824         u64 num_devices = root->fs_info->fs_devices->rw_devices;
3825         u64 target;
3826         u64 tmp;
3827
3828         /*
3829          * see if restripe for this chunk_type is in progress, if so
3830          * try to reduce to the target profile
3831          */
3832         spin_lock(&root->fs_info->balance_lock);
3833         target = get_restripe_target(root->fs_info, flags);
3834         if (target) {
3835                 /* pick target profile only if it's already available */
3836                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3837                         spin_unlock(&root->fs_info->balance_lock);
3838                         return extended_to_chunk(target);
3839                 }
3840         }
3841         spin_unlock(&root->fs_info->balance_lock);
3842
3843         /* First, mask out the RAID levels which aren't possible */
3844         if (num_devices == 1)
3845                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3846                            BTRFS_BLOCK_GROUP_RAID5);
3847         if (num_devices < 3)
3848                 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3849         if (num_devices < 4)
3850                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3851
3852         tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3853                        BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3854                        BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3855         flags &= ~tmp;
3856
3857         if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3858                 tmp = BTRFS_BLOCK_GROUP_RAID6;
3859         else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3860                 tmp = BTRFS_BLOCK_GROUP_RAID5;
3861         else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3862                 tmp = BTRFS_BLOCK_GROUP_RAID10;
3863         else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3864                 tmp = BTRFS_BLOCK_GROUP_RAID1;
3865         else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3866                 tmp = BTRFS_BLOCK_GROUP_RAID0;
3867
3868         return extended_to_chunk(flags | tmp);
3869 }
3870
3871 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3872 {
3873         unsigned seq;
3874         u64 flags;
3875
3876         do {
3877                 flags = orig_flags;
3878                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3879
3880                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3881                         flags |= root->fs_info->avail_data_alloc_bits;
3882                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3883                         flags |= root->fs_info->avail_system_alloc_bits;
3884                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3885                         flags |= root->fs_info->avail_metadata_alloc_bits;
3886         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3887
3888         return btrfs_reduce_alloc_profile(root, flags);
3889 }
3890
3891 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3892 {
3893         u64 flags;
3894         u64 ret;
3895
3896         if (data)
3897                 flags = BTRFS_BLOCK_GROUP_DATA;
3898         else if (root == root->fs_info->chunk_root)
3899                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3900         else
3901                 flags = BTRFS_BLOCK_GROUP_METADATA;
3902
3903         ret = get_alloc_profile(root, flags);
3904         return ret;
3905 }
3906
3907 /*
3908  * This will check the space that the inode allocates from to make sure we have
3909  * enough space for bytes.
3910  */
3911 int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
3912 {
3913         struct btrfs_space_info *data_sinfo;
3914         struct btrfs_root *root = BTRFS_I(inode)->root;
3915         struct btrfs_fs_info *fs_info = root->fs_info;
3916         u64 used;
3917         int ret = 0;
3918         int need_commit = 2;
3919         int have_pinned_space;
3920
3921         /* make sure bytes are sectorsize aligned */
3922         bytes = ALIGN(bytes, root->sectorsize);
3923
3924         if (btrfs_is_free_space_inode(inode)) {
3925                 need_commit = 0;
3926                 ASSERT(current->journal_info);
3927         }
3928
3929         data_sinfo = fs_info->data_sinfo;
3930         if (!data_sinfo)
3931                 goto alloc;
3932
3933 again:
3934         /* make sure we have enough space to handle the data first */
3935         spin_lock(&data_sinfo->lock);
3936         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3937                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3938                 data_sinfo->bytes_may_use;
3939
3940         if (used + bytes > data_sinfo->total_bytes) {
3941                 struct btrfs_trans_handle *trans;
3942
3943                 /*
3944                  * if we don't have enough free bytes in this space then we need
3945                  * to alloc a new chunk.
3946                  */
3947                 if (!data_sinfo->full) {
3948                         u64 alloc_target;
3949
3950                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3951                         spin_unlock(&data_sinfo->lock);
3952 alloc:
3953                         alloc_target = btrfs_get_alloc_profile(root, 1);
3954                         /*
3955                          * It is ugly that we don't call nolock join
3956                          * transaction for the free space inode case here.
3957                          * But it is safe because we only do the data space
3958                          * reservation for the free space cache in the
3959                          * transaction context, the common join transaction
3960                          * just increase the counter of the current transaction
3961                          * handler, doesn't try to acquire the trans_lock of
3962                          * the fs.
3963                          */
3964                         trans = btrfs_join_transaction(root);
3965                         if (IS_ERR(trans))
3966                                 return PTR_ERR(trans);
3967
3968                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3969                                              alloc_target,
3970                                              CHUNK_ALLOC_NO_FORCE);
3971                         btrfs_end_transaction(trans, root);
3972                         if (ret < 0) {
3973                                 if (ret != -ENOSPC)
3974                                         return ret;
3975                                 else {
3976                                         have_pinned_space = 1;
3977                                         goto commit_trans;
3978                                 }
3979                         }
3980
3981                         if (!data_sinfo)
3982                                 data_sinfo = fs_info->data_sinfo;
3983
3984                         goto again;
3985                 }
3986
3987                 /*
3988                  * If we don't have enough pinned space to deal with this
3989                  * allocation, and no removed chunk in current transaction,
3990                  * don't bother committing the transaction.
3991                  */
3992                 have_pinned_space = percpu_counter_compare(
3993                         &data_sinfo->total_bytes_pinned,
3994                         used + bytes - data_sinfo->total_bytes);
3995                 spin_unlock(&data_sinfo->lock);
3996
3997                 /* commit the current transaction and try again */
3998 commit_trans:
3999                 if (need_commit &&
4000                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
4001                         need_commit--;
4002
4003                         if (need_commit > 0)
4004                                 btrfs_wait_ordered_roots(fs_info, -1);
4005
4006                         trans = btrfs_join_transaction(root);
4007                         if (IS_ERR(trans))
4008                                 return PTR_ERR(trans);
4009                         if (have_pinned_space >= 0 ||
4010                             trans->transaction->have_free_bgs ||
4011                             need_commit > 0) {
4012                                 ret = btrfs_commit_transaction(trans, root);
4013                                 if (ret)
4014                                         return ret;
4015                                 /*
4016                                  * make sure that all running delayed iput are
4017                                  * done
4018                                  */
4019                                 down_write(&root->fs_info->delayed_iput_sem);
4020                                 up_write(&root->fs_info->delayed_iput_sem);
4021                                 goto again;
4022                         } else {
4023                                 btrfs_end_transaction(trans, root);
4024                         }
4025                 }
4026
4027                 trace_btrfs_space_reservation(root->fs_info,
4028                                               "space_info:enospc",
4029                                               data_sinfo->flags, bytes, 1);
4030                 return -ENOSPC;
4031         }
4032         ret = btrfs_qgroup_reserve(root, write_bytes);
4033         if (ret)
4034                 goto out;
4035         data_sinfo->bytes_may_use += bytes;
4036         trace_btrfs_space_reservation(root->fs_info, "space_info",
4037                                       data_sinfo->flags, bytes, 1);
4038 out:
4039         spin_unlock(&data_sinfo->lock);
4040
4041         return ret;
4042 }
4043
4044 /*
4045  * Called if we need to clear a data reservation for this inode.
4046  */
4047 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
4048 {
4049         struct btrfs_root *root = BTRFS_I(inode)->root;
4050         struct btrfs_space_info *data_sinfo;
4051
4052         /* make sure bytes are sectorsize aligned */
4053         bytes = ALIGN(bytes, root->sectorsize);
4054
4055         data_sinfo = root->fs_info->data_sinfo;
4056         spin_lock(&data_sinfo->lock);
4057         WARN_ON(data_sinfo->bytes_may_use < bytes);
4058         data_sinfo->bytes_may_use -= bytes;
4059         trace_btrfs_space_reservation(root->fs_info, "space_info",
4060                                       data_sinfo->flags, bytes, 0);
4061         spin_unlock(&data_sinfo->lock);
4062 }
4063
4064 static void force_metadata_allocation(struct btrfs_fs_info *info)
4065 {
4066         struct list_head *head = &info->space_info;
4067         struct btrfs_space_info *found;
4068
4069         rcu_read_lock();
4070         list_for_each_entry_rcu(found, head, list) {
4071                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4072                         found->force_alloc = CHUNK_ALLOC_FORCE;
4073         }
4074         rcu_read_unlock();
4075 }
4076
4077 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4078 {
4079         return (global->size << 1);
4080 }
4081
4082 static int should_alloc_chunk(struct btrfs_root *root,
4083                               struct btrfs_space_info *sinfo, int force)
4084 {
4085         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4086         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4087         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4088         u64 thresh;
4089
4090         if (force == CHUNK_ALLOC_FORCE)
4091                 return 1;
4092
4093         /*
4094          * We need to take into account the global rsv because for all intents
4095          * and purposes it's used space.  Don't worry about locking the
4096          * global_rsv, it doesn't change except when the transaction commits.
4097          */
4098         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4099                 num_allocated += calc_global_rsv_need_space(global_rsv);
4100
4101         /*
4102          * in limited mode, we want to have some free space up to
4103          * about 1% of the FS size.
4104          */
4105         if (force == CHUNK_ALLOC_LIMITED) {
4106                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4107                 thresh = max_t(u64, 64 * 1024 * 1024,
4108                                div_factor_fine(thresh, 1));
4109
4110                 if (num_bytes - num_allocated < thresh)
4111                         return 1;
4112         }
4113
4114         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
4115                 return 0;
4116         return 1;
4117 }
4118
4119 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4120 {
4121         u64 num_dev;
4122
4123         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4124                     BTRFS_BLOCK_GROUP_RAID0 |
4125                     BTRFS_BLOCK_GROUP_RAID5 |
4126                     BTRFS_BLOCK_GROUP_RAID6))
4127                 num_dev = root->fs_info->fs_devices->rw_devices;
4128         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4129                 num_dev = 2;
4130         else
4131                 num_dev = 1;    /* DUP or single */
4132
4133         return num_dev;
4134 }
4135
4136 /*
4137  * If @is_allocation is true, reserve space in the system space info necessary
4138  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4139  * removing a chunk.
4140  */
4141 void check_system_chunk(struct btrfs_trans_handle *trans,
4142                         struct btrfs_root *root,
4143                         u64 type)
4144 {
4145         struct btrfs_space_info *info;
4146         u64 left;
4147         u64 thresh;
4148         int ret = 0;
4149         u64 num_devs;
4150
4151         /*
4152          * Needed because we can end up allocating a system chunk and for an
4153          * atomic and race free space reservation in the chunk block reserve.
4154          */
4155         ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4156
4157         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4158         spin_lock(&info->lock);
4159         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4160                 info->bytes_reserved - info->bytes_readonly -
4161                 info->bytes_may_use;
4162         spin_unlock(&info->lock);
4163
4164         num_devs = get_profile_num_devs(root, type);
4165
4166         /* num_devs device items to update and 1 chunk item to add or remove */
4167         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4168                 btrfs_calc_trans_metadata_size(root, 1);
4169
4170         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
4171                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4172                         left, thresh, type);
4173                 dump_space_info(info, 0, 0);
4174         }
4175
4176         if (left < thresh) {
4177                 u64 flags;
4178
4179                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4180                 /*
4181                  * Ignore failure to create system chunk. We might end up not
4182                  * needing it, as we might not need to COW all nodes/leafs from
4183                  * the paths we visit in the chunk tree (they were already COWed
4184                  * or created in the current transaction for example).
4185                  */
4186                 ret = btrfs_alloc_chunk(trans, root, flags);
4187         }
4188
4189         if (!ret) {
4190                 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4191                                           &root->fs_info->chunk_block_rsv,
4192                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4193                 if (!ret)
4194                         trans->chunk_bytes_reserved += thresh;
4195         }
4196 }
4197
4198 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4199                           struct btrfs_root *extent_root, u64 flags, int force)
4200 {
4201         struct btrfs_space_info *space_info;
4202         struct btrfs_fs_info *fs_info = extent_root->fs_info;
4203         int wait_for_alloc = 0;
4204         int ret = 0;
4205
4206         /* Don't re-enter if we're already allocating a chunk */
4207         if (trans->allocating_chunk)
4208                 return -ENOSPC;
4209
4210         space_info = __find_space_info(extent_root->fs_info, flags);
4211         if (!space_info) {
4212                 ret = update_space_info(extent_root->fs_info, flags,
4213                                         0, 0, &space_info);
4214                 BUG_ON(ret); /* -ENOMEM */
4215         }
4216         BUG_ON(!space_info); /* Logic error */
4217
4218 again:
4219         spin_lock(&space_info->lock);
4220         if (force < space_info->force_alloc)
4221                 force = space_info->force_alloc;
4222         if (space_info->full) {
4223                 if (should_alloc_chunk(extent_root, space_info, force))
4224                         ret = -ENOSPC;
4225                 else
4226                         ret = 0;
4227                 spin_unlock(&space_info->lock);
4228                 return ret;
4229         }
4230
4231         if (!should_alloc_chunk(extent_root, space_info, force)) {
4232                 spin_unlock(&space_info->lock);
4233                 return 0;
4234         } else if (space_info->chunk_alloc) {
4235                 wait_for_alloc = 1;
4236         } else {
4237                 space_info->chunk_alloc = 1;
4238         }
4239
4240         spin_unlock(&space_info->lock);
4241
4242         mutex_lock(&fs_info->chunk_mutex);
4243
4244         /*
4245          * The chunk_mutex is held throughout the entirety of a chunk
4246          * allocation, so once we've acquired the chunk_mutex we know that the
4247          * other guy is done and we need to recheck and see if we should
4248          * allocate.
4249          */
4250         if (wait_for_alloc) {
4251                 mutex_unlock(&fs_info->chunk_mutex);
4252                 wait_for_alloc = 0;
4253                 goto again;
4254         }
4255
4256         trans->allocating_chunk = true;
4257
4258         /*
4259          * If we have mixed data/metadata chunks we want to make sure we keep
4260          * allocating mixed chunks instead of individual chunks.
4261          */
4262         if (btrfs_mixed_space_info(space_info))
4263                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4264
4265         /*
4266          * if we're doing a data chunk, go ahead and make sure that
4267          * we keep a reasonable number of metadata chunks allocated in the
4268          * FS as well.
4269          */
4270         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4271                 fs_info->data_chunk_allocations++;
4272                 if (!(fs_info->data_chunk_allocations %
4273                       fs_info->metadata_ratio))
4274                         force_metadata_allocation(fs_info);
4275         }
4276
4277         /*
4278          * Check if we have enough space in SYSTEM chunk because we may need
4279          * to update devices.
4280          */
4281         check_system_chunk(trans, extent_root, flags);
4282
4283         ret = btrfs_alloc_chunk(trans, extent_root, flags);
4284         trans->allocating_chunk = false;
4285
4286         spin_lock(&space_info->lock);
4287         if (ret < 0 && ret != -ENOSPC)
4288                 goto out;
4289         if (ret)
4290                 space_info->full = 1;
4291         else
4292                 ret = 1;
4293
4294         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4295 out:
4296         space_info->chunk_alloc = 0;
4297         spin_unlock(&space_info->lock);
4298         mutex_unlock(&fs_info->chunk_mutex);
4299         /*
4300          * When we allocate a new chunk we reserve space in the chunk block
4301          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4302          * add new nodes/leafs to it if we end up needing to do it when
4303          * inserting the chunk item and updating device items as part of the
4304          * second phase of chunk allocation, performed by
4305          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4306          * large number of new block groups to create in our transaction
4307          * handle's new_bgs list to avoid exhausting the chunk block reserve
4308          * in extreme cases - like having a single transaction create many new
4309          * block groups when starting to write out the free space caches of all
4310          * the block groups that were made dirty during the lifetime of the
4311          * transaction.
4312          */
4313         if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
4314                 btrfs_create_pending_block_groups(trans, trans->root);
4315                 btrfs_trans_release_chunk_metadata(trans);
4316         }
4317         return ret;
4318 }
4319
4320 static int can_overcommit(struct btrfs_root *root,
4321                           struct btrfs_space_info *space_info, u64 bytes,
4322                           enum btrfs_reserve_flush_enum flush)
4323 {
4324         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4325         u64 profile = btrfs_get_alloc_profile(root, 0);
4326         u64 space_size;
4327         u64 avail;
4328         u64 used;
4329
4330         used = space_info->bytes_used + space_info->bytes_reserved +
4331                 space_info->bytes_pinned + space_info->bytes_readonly;
4332
4333         /*
4334          * We only want to allow over committing if we have lots of actual space
4335          * free, but if we don't have enough space to handle the global reserve
4336          * space then we could end up having a real enospc problem when trying
4337          * to allocate a chunk or some other such important allocation.
4338          */
4339         spin_lock(&global_rsv->lock);
4340         space_size = calc_global_rsv_need_space(global_rsv);
4341         spin_unlock(&global_rsv->lock);
4342         if (used + space_size >= space_info->total_bytes)
4343                 return 0;
4344
4345         used += space_info->bytes_may_use;
4346
4347         spin_lock(&root->fs_info->free_chunk_lock);
4348         avail = root->fs_info->free_chunk_space;
4349         spin_unlock(&root->fs_info->free_chunk_lock);
4350
4351         /*
4352          * If we have dup, raid1 or raid10 then only half of the free
4353          * space is actually useable.  For raid56, the space info used
4354          * doesn't include the parity drive, so we don't have to
4355          * change the math
4356          */
4357         if (profile & (BTRFS_BLOCK_GROUP_DUP |
4358                        BTRFS_BLOCK_GROUP_RAID1 |
4359                        BTRFS_BLOCK_GROUP_RAID10))
4360                 avail >>= 1;
4361
4362         /*
4363          * If we aren't flushing all things, let us overcommit up to
4364          * 1/2th of the space. If we can flush, don't let us overcommit
4365          * too much, let it overcommit up to 1/8 of the space.
4366          */
4367         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4368                 avail >>= 3;
4369         else
4370                 avail >>= 1;
4371
4372         if (used + bytes < space_info->total_bytes + avail)
4373                 return 1;
4374         return 0;
4375 }
4376
4377 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4378                                          unsigned long nr_pages, int nr_items)
4379 {
4380         struct super_block *sb = root->fs_info->sb;
4381
4382         if (down_read_trylock(&sb->s_umount)) {
4383                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4384                 up_read(&sb->s_umount);
4385         } else {
4386                 /*
4387                  * We needn't worry the filesystem going from r/w to r/o though
4388                  * we don't acquire ->s_umount mutex, because the filesystem
4389                  * should guarantee the delalloc inodes list be empty after
4390                  * the filesystem is readonly(all dirty pages are written to
4391                  * the disk).
4392                  */
4393                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4394                 if (!current->journal_info)
4395                         btrfs_wait_ordered_roots(root->fs_info, nr_items);
4396         }
4397 }
4398
4399 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4400 {
4401         u64 bytes;
4402         int nr;
4403
4404         bytes = btrfs_calc_trans_metadata_size(root, 1);
4405         nr = (int)div64_u64(to_reclaim, bytes);
4406         if (!nr)
4407                 nr = 1;
4408         return nr;
4409 }
4410
4411 #define EXTENT_SIZE_PER_ITEM    (256 * 1024)
4412
4413 /*
4414  * shrink metadata reservation for delalloc
4415  */
4416 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4417                             bool wait_ordered)
4418 {
4419         struct btrfs_block_rsv *block_rsv;
4420         struct btrfs_space_info *space_info;
4421         struct btrfs_trans_handle *trans;
4422         u64 delalloc_bytes;
4423         u64 max_reclaim;
4424         long time_left;
4425         unsigned long nr_pages;
4426         int loops;
4427         int items;
4428         enum btrfs_reserve_flush_enum flush;
4429
4430         /* Calc the number of the pages we need flush for space reservation */
4431         items = calc_reclaim_items_nr(root, to_reclaim);
4432         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4433
4434         trans = (struct btrfs_trans_handle *)current->journal_info;
4435         block_rsv = &root->fs_info->delalloc_block_rsv;
4436         space_info = block_rsv->space_info;
4437
4438         delalloc_bytes = percpu_counter_sum_positive(
4439                                                 &root->fs_info->delalloc_bytes);
4440         if (delalloc_bytes == 0) {
4441                 if (trans)
4442                         return;
4443                 if (wait_ordered)
4444                         btrfs_wait_ordered_roots(root->fs_info, items);
4445                 return;
4446         }
4447
4448         loops = 0;
4449         while (delalloc_bytes && loops < 3) {
4450                 max_reclaim = min(delalloc_bytes, to_reclaim);
4451                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4452                 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4453                 /*
4454                  * We need to wait for the async pages to actually start before
4455                  * we do anything.
4456                  */
4457                 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4458                 if (!max_reclaim)
4459                         goto skip_async;
4460
4461                 if (max_reclaim <= nr_pages)
4462                         max_reclaim = 0;
4463                 else
4464                         max_reclaim -= nr_pages;
4465
4466                 wait_event(root->fs_info->async_submit_wait,
4467                            atomic_read(&root->fs_info->async_delalloc_pages) <=
4468                            (int)max_reclaim);
4469 skip_async:
4470                 if (!trans)
4471                         flush = BTRFS_RESERVE_FLUSH_ALL;
4472                 else
4473                         flush = BTRFS_RESERVE_NO_FLUSH;
4474                 spin_lock(&space_info->lock);
4475                 if (can_overcommit(root, space_info, orig, flush)) {
4476                         spin_unlock(&space_info->lock);
4477                         break;
4478                 }
4479                 spin_unlock(&space_info->lock);
4480
4481                 loops++;
4482                 if (wait_ordered && !trans) {
4483                         btrfs_wait_ordered_roots(root->fs_info, items);
4484                 } else {
4485                         time_left = schedule_timeout_killable(1);
4486                         if (time_left)
4487                                 break;
4488                 }
4489                 delalloc_bytes = percpu_counter_sum_positive(
4490                                                 &root->fs_info->delalloc_bytes);
4491         }
4492 }
4493
4494 /**
4495  * maybe_commit_transaction - possibly commit the transaction if its ok to
4496  * @root - the root we're allocating for
4497  * @bytes - the number of bytes we want to reserve
4498  * @force - force the commit
4499  *
4500  * This will check to make sure that committing the transaction will actually
4501  * get us somewhere and then commit the transaction if it does.  Otherwise it
4502  * will return -ENOSPC.
4503  */
4504 static int may_commit_transaction(struct btrfs_root *root,
4505                                   struct btrfs_space_info *space_info,
4506                                   u64 bytes, int force)
4507 {
4508         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4509         struct btrfs_trans_handle *trans;
4510
4511         trans = (struct btrfs_trans_handle *)current->journal_info;
4512         if (trans)
4513                 return -EAGAIN;
4514
4515         if (force)
4516                 goto commit;
4517
4518         /* See if there is enough pinned space to make this reservation */
4519         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4520                                    bytes) >= 0)
4521                 goto commit;
4522
4523         /*
4524          * See if there is some space in the delayed insertion reservation for
4525          * this reservation.
4526          */
4527         if (space_info != delayed_rsv->space_info)
4528                 return -ENOSPC;
4529
4530         spin_lock(&delayed_rsv->lock);
4531         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4532                                    bytes - delayed_rsv->size) >= 0) {
4533                 spin_unlock(&delayed_rsv->lock);
4534                 return -ENOSPC;
4535         }
4536         spin_unlock(&delayed_rsv->lock);
4537
4538 commit:
4539         trans = btrfs_join_transaction(root);
4540         if (IS_ERR(trans))
4541                 return -ENOSPC;
4542
4543         return btrfs_commit_transaction(trans, root);
4544 }
4545
/*
 * States handed to flush_space().  Callers start at FLUSH_DELAYED_ITEMS_NR
 * and step through the values in increasing order.
 */
enum flush_state {
        FLUSH_DELAYED_ITEMS_NR = 1,     /* run a bounded number of delayed items */
        FLUSH_DELAYED_ITEMS,            /* run all delayed items */
        FLUSH_DELALLOC,                 /* shrink delalloc without waiting */
        FLUSH_DELALLOC_WAIT,            /* shrink delalloc and wait */
        ALLOC_CHUNK,                    /* try to allocate a new chunk */
        COMMIT_TRANS,                   /* commit the transaction if it helps */
};
4554
4555 static int flush_space(struct btrfs_root *root,
4556                        struct btrfs_space_info *space_info, u64 num_bytes,
4557                        u64 orig_bytes, int state)
4558 {
4559         struct btrfs_trans_handle *trans;
4560         int nr;
4561         int ret = 0;
4562
4563         switch (state) {
4564         case FLUSH_DELAYED_ITEMS_NR:
4565         case FLUSH_DELAYED_ITEMS:
4566                 if (state == FLUSH_DELAYED_ITEMS_NR)
4567                         nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4568                 else
4569                         nr = -1;
4570
4571                 trans = btrfs_join_transaction(root);
4572                 if (IS_ERR(trans)) {
4573                         ret = PTR_ERR(trans);
4574                         break;
4575                 }
4576                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4577                 btrfs_end_transaction(trans, root);
4578                 break;
4579         case FLUSH_DELALLOC:
4580         case FLUSH_DELALLOC_WAIT:
4581                 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4582                                 state == FLUSH_DELALLOC_WAIT);
4583                 break;
4584         case ALLOC_CHUNK:
4585                 trans = btrfs_join_transaction(root);
4586                 if (IS_ERR(trans)) {
4587                         ret = PTR_ERR(trans);
4588                         break;
4589                 }
4590                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4591                                      btrfs_get_alloc_profile(root, 0),
4592                                      CHUNK_ALLOC_NO_FORCE);
4593                 btrfs_end_transaction(trans, root);
4594                 if (ret == -ENOSPC)
4595                         ret = 0;
4596                 break;
4597         case COMMIT_TRANS:
4598                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4599                 break;
4600         default:
4601                 ret = -ENOSPC;
4602                 break;
4603         }
4604
4605         return ret;
4606 }
4607
4608 static inline u64
4609 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4610                                  struct btrfs_space_info *space_info)
4611 {
4612         u64 used;
4613         u64 expected;
4614         u64 to_reclaim;
4615
4616         to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4617                                 16 * 1024 * 1024);
4618         spin_lock(&space_info->lock);
4619         if (can_overcommit(root, space_info, to_reclaim,
4620                            BTRFS_RESERVE_FLUSH_ALL)) {
4621                 to_reclaim = 0;
4622                 goto out;
4623         }
4624
4625         used = space_info->bytes_used + space_info->bytes_reserved +
4626                space_info->bytes_pinned + space_info->bytes_readonly +
4627                space_info->bytes_may_use;
4628         if (can_overcommit(root, space_info, 1024 * 1024,
4629                            BTRFS_RESERVE_FLUSH_ALL))
4630                 expected = div_factor_fine(space_info->total_bytes, 95);
4631         else
4632                 expected = div_factor_fine(space_info->total_bytes, 90);
4633
4634         if (used > expected)
4635                 to_reclaim = used - expected;
4636         else
4637                 to_reclaim = 0;
4638         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4639                                      space_info->bytes_reserved);
4640 out:
4641         spin_unlock(&space_info->lock);
4642
4643         return to_reclaim;
4644 }
4645
4646 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4647                                         struct btrfs_fs_info *fs_info, u64 used)
4648 {
4649         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4650
4651         /* If we're just plain full then async reclaim just slows us down. */
4652         if (space_info->bytes_used >= thresh)
4653                 return 0;
4654
4655         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4656                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4657 }
4658
4659 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4660                                        struct btrfs_fs_info *fs_info,
4661                                        int flush_state)
4662 {
4663         u64 used;
4664
4665         spin_lock(&space_info->lock);
4666         /*
4667          * We run out of space and have not got any free space via flush_space,
4668          * so don't bother doing async reclaim.
4669          */
4670         if (flush_state > COMMIT_TRANS && space_info->full) {
4671                 spin_unlock(&space_info->lock);
4672                 return 0;
4673         }
4674
4675         used = space_info->bytes_used + space_info->bytes_reserved +
4676                space_info->bytes_pinned + space_info->bytes_readonly +
4677                space_info->bytes_may_use;
4678         if (need_do_async_reclaim(space_info, fs_info, used)) {
4679                 spin_unlock(&space_info->lock);
4680                 return 1;
4681         }
4682         spin_unlock(&space_info->lock);
4683
4684         return 0;
4685 }
4686
4687 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4688 {
4689         struct btrfs_fs_info *fs_info;
4690         struct btrfs_space_info *space_info;
4691         u64 to_reclaim;
4692         int flush_state;
4693
4694         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4695         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4696
4697         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4698                                                       space_info);
4699         if (!to_reclaim)
4700                 return;
4701
4702         flush_state = FLUSH_DELAYED_ITEMS_NR;
4703         do {
4704                 flush_space(fs_info->fs_root, space_info, to_reclaim,
4705                             to_reclaim, flush_state);
4706                 flush_state++;
4707                 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4708                                                  flush_state))
4709                         return;
4710         } while (flush_state < COMMIT_TRANS);
4711 }
4712
4713 void btrfs_init_async_reclaim_work(struct work_struct *work)
4714 {
4715         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4716 }
4717
4718 /**
4719  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4720  * @root - the root we're allocating for
4721  * @block_rsv - the block_rsv we're allocating for
4722  * @orig_bytes - the number of bytes we want
4723  * @flush - whether or not we can flush to make our reservation
4724  *
4725  * This will reserve orgi_bytes number of bytes from the space info associated
4726  * with the block_rsv.  If there is not enough space it will make an attempt to
4727  * flush out space to make room.  It will do this by flushing delalloc if
4728  * possible or committing the transaction.  If flush is 0 then no attempts to
4729  * regain reservations will be made and this will fail if there is not enough
4730  * space already.
4731  */
4732 static int reserve_metadata_bytes(struct btrfs_root *root,
4733                                   struct btrfs_block_rsv *block_rsv,
4734                                   u64 orig_bytes,
4735                                   enum btrfs_reserve_flush_enum flush)
4736 {
4737         struct btrfs_space_info *space_info = block_rsv->space_info;
4738         u64 used;
4739         u64 num_bytes = orig_bytes;
4740         int flush_state = FLUSH_DELAYED_ITEMS_NR;
4741         int ret = 0;
4742         bool flushing = false;
4743
4744 again:
4745         ret = 0;
4746         spin_lock(&space_info->lock);
4747         /*
4748          * We only want to wait if somebody other than us is flushing and we
4749          * are actually allowed to flush all things.
4750          */
4751         while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4752                space_info->flush) {
4753                 spin_unlock(&space_info->lock);
4754                 /*
4755                  * If we have a trans handle we can't wait because the flusher
4756                  * may have to commit the transaction, which would mean we would
4757                  * deadlock since we are waiting for the flusher to finish, but
4758                  * hold the current transaction open.
4759                  */
4760                 if (current->journal_info)
4761                         return -EAGAIN;
4762                 ret = wait_event_killable(space_info->wait, !space_info->flush);
4763                 /* Must have been killed, return */
4764                 if (ret)
4765                         return -EINTR;
4766
4767                 spin_lock(&space_info->lock);
4768         }
4769
4770         ret = -ENOSPC;
4771         used = space_info->bytes_used + space_info->bytes_reserved +
4772                 space_info->bytes_pinned + space_info->bytes_readonly +
4773                 space_info->bytes_may_use;
4774
4775         /*
4776          * The idea here is that we've not already over-reserved the block group
4777          * then we can go ahead and save our reservation first and then start
4778          * flushing if we need to.  Otherwise if we've already overcommitted
4779          * lets start flushing stuff first and then come back and try to make
4780          * our reservation.
4781          */
4782         if (used <= space_info->total_bytes) {
4783                 if (used + orig_bytes <= space_info->total_bytes) {
4784                         space_info->bytes_may_use += orig_bytes;
4785                         trace_btrfs_space_reservation(root->fs_info,
4786                                 "space_info", space_info->flags, orig_bytes, 1);
4787                         ret = 0;
4788                 } else {
4789                         /*
4790                          * Ok set num_bytes to orig_bytes since we aren't
4791                          * overocmmitted, this way we only try and reclaim what
4792                          * we need.
4793                          */
4794                         num_bytes = orig_bytes;
4795                 }
4796         } else {
4797                 /*
4798                  * Ok we're over committed, set num_bytes to the overcommitted
4799                  * amount plus the amount of bytes that we need for this
4800                  * reservation.
4801                  */
4802                 num_bytes = used - space_info->total_bytes +
4803                         (orig_bytes * 2);
4804         }
4805
4806         if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4807                 space_info->bytes_may_use += orig_bytes;
4808                 trace_btrfs_space_reservation(root->fs_info, "space_info",
4809                                               space_info->flags, orig_bytes,
4810                                               1);
4811                 ret = 0;
4812         }
4813
4814         /*
4815          * Couldn't make our reservation, save our place so while we're trying
4816          * to reclaim space we can actually use it instead of somebody else
4817          * stealing it from us.
4818          *
4819          * We make the other tasks wait for the flush only when we can flush
4820          * all things.
4821          */
4822         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4823                 flushing = true;
4824                 space_info->flush = 1;
4825         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4826                 used += orig_bytes;
4827                 /*
4828                  * We will do the space reservation dance during log replay,
4829                  * which means we won't have fs_info->fs_root set, so don't do
4830                  * the async reclaim as we will panic.
4831                  */
4832                 if (!root->fs_info->log_root_recovering &&
4833                     need_do_async_reclaim(space_info, root->fs_info, used) &&
4834                     !work_busy(&root->fs_info->async_reclaim_work))
4835                         queue_work(system_unbound_wq,
4836                                    &root->fs_info->async_reclaim_work);
4837         }
4838         spin_unlock(&space_info->lock);
4839
4840         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4841                 goto out;
4842
4843         ret = flush_space(root, space_info, num_bytes, orig_bytes,
4844                           flush_state);
4845         flush_state++;
4846
4847         /*
4848          * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4849          * would happen. So skip delalloc flush.
4850          */
4851         if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4852             (flush_state == FLUSH_DELALLOC ||
4853              flush_state == FLUSH_DELALLOC_WAIT))
4854                 flush_state = ALLOC_CHUNK;
4855
4856         if (!ret)
4857                 goto again;
4858         else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4859                  flush_state < COMMIT_TRANS)
4860                 goto again;
4861         else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4862                  flush_state <= COMMIT_TRANS)
4863                 goto again;
4864
4865 out:
4866         if (ret == -ENOSPC &&
4867             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4868                 struct btrfs_block_rsv *global_rsv =
4869                         &root->fs_info->global_block_rsv;
4870
4871                 if (block_rsv != global_rsv &&
4872                     !block_rsv_use_bytes(global_rsv, orig_bytes))
4873                         ret = 0;
4874         }
4875         if (ret == -ENOSPC)
4876                 trace_btrfs_space_reservation(root->fs_info,
4877                                               "space_info:enospc",
4878                                               space_info->flags, orig_bytes, 1);
4879         if (flushing) {
4880                 spin_lock(&space_info->lock);
4881                 space_info->flush = 0;
4882                 wake_up_all(&space_info->wait);
4883                 spin_unlock(&space_info->lock);
4884         }
4885         return ret;
4886 }
4887
4888 static struct btrfs_block_rsv *get_block_rsv(
4889                                         const struct btrfs_trans_handle *trans,
4890                                         const struct btrfs_root *root)
4891 {
4892         struct btrfs_block_rsv *block_rsv = NULL;
4893
4894         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4895                 block_rsv = trans->block_rsv;
4896
4897         if (root == root->fs_info->csum_root && trans->adding_csums)
4898                 block_rsv = trans->block_rsv;
4899
4900         if (root == root->fs_info->uuid_root)
4901                 block_rsv = trans->block_rsv;
4902
4903         if (!block_rsv)
4904                 block_rsv = root->block_rsv;
4905
4906         if (!block_rsv)
4907                 block_rsv = &root->fs_info->empty_block_rsv;
4908
4909         return block_rsv;
4910 }
4911
4912 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4913                                u64 num_bytes)
4914 {
4915         int ret = -ENOSPC;
4916         spin_lock(&block_rsv->lock);
4917         if (block_rsv->reserved >= num_bytes) {
4918                 block_rsv->reserved -= num_bytes;
4919                 if (block_rsv->reserved < block_rsv->size)
4920                         block_rsv->full = 0;
4921                 ret = 0;
4922         }
4923         spin_unlock(&block_rsv->lock);
4924         return ret;
4925 }
4926
4927 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4928                                 u64 num_bytes, int update_size)
4929 {
4930         spin_lock(&block_rsv->lock);
4931         block_rsv->reserved += num_bytes;
4932         if (update_size)
4933                 block_rsv->size += num_bytes;
4934         else if (block_rsv->reserved >= block_rsv->size)
4935                 block_rsv->full = 1;
4936         spin_unlock(&block_rsv->lock);
4937 }
4938
4939 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4940                              struct btrfs_block_rsv *dest, u64 num_bytes,
4941                              int min_factor)
4942 {
4943         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4944         u64 min_bytes;
4945
4946         if (global_rsv->space_info != dest->space_info)
4947                 return -ENOSPC;
4948
4949         spin_lock(&global_rsv->lock);
4950         min_bytes = div_factor(global_rsv->size, min_factor);
4951         if (global_rsv->reserved < min_bytes + num_bytes) {
4952                 spin_unlock(&global_rsv->lock);
4953                 return -ENOSPC;
4954         }
4955         global_rsv->reserved -= num_bytes;
4956         if (global_rsv->reserved < global_rsv->size)
4957                 global_rsv->full = 0;
4958         spin_unlock(&global_rsv->lock);
4959
4960         block_rsv_add_bytes(dest, num_bytes, 1);
4961         return 0;
4962 }
4963
4964 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4965                                     struct btrfs_block_rsv *block_rsv,
4966                                     struct btrfs_block_rsv *dest, u64 num_bytes)
4967 {
4968         struct btrfs_space_info *space_info = block_rsv->space_info;
4969
4970         spin_lock(&block_rsv->lock);
4971         if (num_bytes == (u64)-1)
4972                 num_bytes = block_rsv->size;
4973         block_rsv->size -= num_bytes;
4974         if (block_rsv->reserved >= block_rsv->size) {
4975                 num_bytes = block_rsv->reserved - block_rsv->size;
4976                 block_rsv->reserved = block_rsv->size;
4977                 block_rsv->full = 1;
4978         } else {
4979                 num_bytes = 0;
4980         }
4981         spin_unlock(&block_rsv->lock);
4982
4983         if (num_bytes > 0) {
4984                 if (dest) {
4985                         spin_lock(&dest->lock);
4986                         if (!dest->full) {
4987                                 u64 bytes_to_add;
4988
4989                                 bytes_to_add = dest->size - dest->reserved;
4990                                 bytes_to_add = min(num_bytes, bytes_to_add);
4991                                 dest->reserved += bytes_to_add;
4992                                 if (dest->reserved >= dest->size)
4993                                         dest->full = 1;
4994                                 num_bytes -= bytes_to_add;
4995                         }
4996                         spin_unlock(&dest->lock);
4997                 }
4998                 if (num_bytes) {
4999                         spin_lock(&space_info->lock);
5000                         space_info->bytes_may_use -= num_bytes;
5001                         trace_btrfs_space_reservation(fs_info, "space_info",
5002                                         space_info->flags, num_bytes, 0);
5003                         spin_unlock(&space_info->lock);
5004                 }
5005         }
5006 }
5007
5008 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
5009                                    struct btrfs_block_rsv *dst, u64 num_bytes)
5010 {
5011         int ret;
5012
5013         ret = block_rsv_use_bytes(src, num_bytes);
5014         if (ret)
5015                 return ret;
5016
5017         block_rsv_add_bytes(dst, num_bytes, 1);
5018         return 0;
5019 }
5020
5021 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5022 {
5023         memset(rsv, 0, sizeof(*rsv));
5024         spin_lock_init(&rsv->lock);
5025         rsv->type = type;
5026 }
5027
5028 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5029                                               unsigned short type)
5030 {
5031         struct btrfs_block_rsv *block_rsv;
5032         struct btrfs_fs_info *fs_info = root->fs_info;
5033
5034         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5035         if (!block_rsv)
5036                 return NULL;
5037
5038         btrfs_init_block_rsv(block_rsv, type);
5039         block_rsv->space_info = __find_space_info(fs_info,
5040                                                   BTRFS_BLOCK_GROUP_METADATA);
5041         return block_rsv;
5042 }
5043
5044 void btrfs_free_block_rsv(struct btrfs_root *root,
5045                           struct btrfs_block_rsv *rsv)
5046 {
5047         if (!rsv)
5048                 return;
5049         btrfs_block_rsv_release(root, rsv, (u64)-1);
5050         kfree(rsv);
5051 }
5052
/*
 * Free @rsv without releasing any of its reserved bytes first.
 */
void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
{
        kfree(rsv);
}
5057
5058 int btrfs_block_rsv_add(struct btrfs_root *root,
5059                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5060                         enum btrfs_reserve_flush_enum flush)
5061 {
5062         int ret;
5063
5064         if (num_bytes == 0)
5065                 return 0;
5066
5067         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5068         if (!ret) {
5069                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5070                 return 0;
5071         }
5072
5073         return ret;
5074 }
5075
5076 int btrfs_block_rsv_check(struct btrfs_root *root,
5077                           struct btrfs_block_rsv *block_rsv, int min_factor)
5078 {
5079         u64 num_bytes = 0;
5080         int ret = -ENOSPC;
5081
5082         if (!block_rsv)
5083                 return 0;
5084
5085         spin_lock(&block_rsv->lock);
5086         num_bytes = div_factor(block_rsv->size, min_factor);
5087         if (block_rsv->reserved >= num_bytes)
5088                 ret = 0;
5089         spin_unlock(&block_rsv->lock);
5090
5091         return ret;
5092 }
5093
5094 int btrfs_block_rsv_refill(struct btrfs_root *root,
5095                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5096                            enum btrfs_reserve_flush_enum flush)
5097 {
5098         u64 num_bytes = 0;
5099         int ret = -ENOSPC;
5100
5101         if (!block_rsv)
5102                 return 0;
5103
5104         spin_lock(&block_rsv->lock);
5105         num_bytes = min_reserved;
5106         if (block_rsv->reserved >= num_bytes)
5107                 ret = 0;
5108         else
5109                 num_bytes -= block_rsv->reserved;
5110         spin_unlock(&block_rsv->lock);
5111
5112         if (!ret)
5113                 return 0;
5114
5115         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5116         if (!ret) {
5117                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5118                 return 0;
5119         }
5120
5121         return ret;
5122 }
5123
5124 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
5125                             struct btrfs_block_rsv *dst_rsv,
5126                             u64 num_bytes)
5127 {
5128         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5129 }
5130
5131 void btrfs_block_rsv_release(struct btrfs_root *root,
5132                              struct btrfs_block_rsv *block_rsv,
5133                              u64 num_bytes)
5134 {
5135         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5136         if (global_rsv == block_rsv ||
5137             block_rsv->space_info != global_rsv->space_info)
5138                 global_rsv = NULL;
5139         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5140                                 num_bytes);
5141 }
5142
5143 /*
5144  * helper to calculate size of global block reservation.
5145  * the desired value is sum of space used by extent tree,
5146  * checksum tree and root tree
5147  */
5148 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
5149 {
5150         struct btrfs_space_info *sinfo;
5151         u64 num_bytes;
5152         u64 meta_used;
5153         u64 data_used;
5154         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
5155
5156         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
5157         spin_lock(&sinfo->lock);
5158         data_used = sinfo->bytes_used;
5159         spin_unlock(&sinfo->lock);
5160
5161         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5162         spin_lock(&sinfo->lock);
5163         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
5164                 data_used = 0;
5165         meta_used = sinfo->bytes_used;
5166         spin_unlock(&sinfo->lock);
5167
5168         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
5169                     csum_size * 2;
5170         num_bytes += div_u64(data_used + meta_used, 50);
5171
5172         if (num_bytes * 3 > meta_used)
5173                 num_bytes = div_u64(meta_used, 3);
5174
5175         return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
5176 }
5177
5178 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5179 {
5180         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5181         struct btrfs_space_info *sinfo = block_rsv->space_info;
5182         u64 num_bytes;
5183
5184         num_bytes = calc_global_metadata_size(fs_info);
5185
5186         spin_lock(&sinfo->lock);
5187         spin_lock(&block_rsv->lock);
5188
5189         block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
5190
5191         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5192                     sinfo->bytes_reserved + sinfo->bytes_readonly +
5193                     sinfo->bytes_may_use;
5194
5195         if (sinfo->total_bytes > num_bytes) {
5196                 num_bytes = sinfo->total_bytes - num_bytes;
5197                 block_rsv->reserved += num_bytes;
5198                 sinfo->bytes_may_use += num_bytes;
5199                 trace_btrfs_space_reservation(fs_info, "space_info",
5200                                       sinfo->flags, num_bytes, 1);
5201         }
5202
5203         if (block_rsv->reserved >= block_rsv->size) {
5204                 num_bytes = block_rsv->reserved - block_rsv->size;
5205                 sinfo->bytes_may_use -= num_bytes;
5206                 trace_btrfs_space_reservation(fs_info, "space_info",
5207                                       sinfo->flags, num_bytes, 0);
5208                 block_rsv->reserved = block_rsv->size;
5209                 block_rsv->full = 1;
5210         }
5211
5212         spin_unlock(&block_rsv->lock);
5213         spin_unlock(&sinfo->lock);
5214 }
5215
5216 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5217 {
5218         struct btrfs_space_info *space_info;
5219
5220         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5221         fs_info->chunk_block_rsv.space_info = space_info;
5222
5223         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5224         fs_info->global_block_rsv.space_info = space_info;
5225         fs_info->delalloc_block_rsv.space_info = space_info;
5226         fs_info->trans_block_rsv.space_info = space_info;
5227         fs_info->empty_block_rsv.space_info = space_info;
5228         fs_info->delayed_block_rsv.space_info = space_info;
5229
5230         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5231         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5232         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5233         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5234         if (fs_info->quota_root)
5235                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5236         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5237
5238         update_global_block_rsv(fs_info);
5239 }
5240
5241 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5242 {
5243         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5244                                 (u64)-1);
5245         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5246         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5247         WARN_ON(fs_info->trans_block_rsv.size > 0);
5248         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5249         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5250         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5251         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5252         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5253 }
5254
5255 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5256                                   struct btrfs_root *root)
5257 {
5258         if (!trans->block_rsv)
5259                 return;
5260
5261         if (!trans->bytes_reserved)
5262                 return;
5263
5264         trace_btrfs_space_reservation(root->fs_info, "transaction",
5265                                       trans->transid, trans->bytes_reserved, 0);
5266         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5267         trans->bytes_reserved = 0;
5268 }
5269
5270 /*
5271  * To be called after all the new block groups attached to the transaction
5272  * handle have been created (btrfs_create_pending_block_groups()).
5273  */
5274 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5275 {
5276         struct btrfs_fs_info *fs_info = trans->root->fs_info;
5277
5278         if (!trans->chunk_bytes_reserved)
5279                 return;
5280
5281         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5282
5283         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5284                                 trans->chunk_bytes_reserved);
5285         trans->chunk_bytes_reserved = 0;
5286 }
5287
5288 /* Can only return 0 or -ENOSPC */
5289 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5290                                   struct inode *inode)
5291 {
5292         struct btrfs_root *root = BTRFS_I(inode)->root;
5293         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
5294         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5295
5296         /*
5297          * We need to hold space in order to delete our orphan item once we've
5298          * added it, so this takes the reservation so we can release it later
5299          * when we are truly done with the orphan item.
5300          */
5301         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5302         trace_btrfs_space_reservation(root->fs_info, "orphan",
5303                                       btrfs_ino(inode), num_bytes, 1);
5304         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5305 }
5306
5307 void btrfs_orphan_release_metadata(struct inode *inode)
5308 {
5309         struct btrfs_root *root = BTRFS_I(inode)->root;
5310         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5311         trace_btrfs_space_reservation(root->fs_info, "orphan",
5312                                       btrfs_ino(inode), num_bytes, 0);
5313         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5314 }
5315
5316 /*
5317  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5318  * root: the root of the parent directory
5319  * rsv: block reservation
5320  * items: the number of items that we need do reservation
5321  * qgroup_reserved: used to return the reserved size in qgroup
5322  *
5323  * This function is used to reserve the space for snapshot/subvolume
5324  * creation and deletion. Those operations are different with the
5325  * common file/directory operations, they change two fs/file trees
5326  * and root tree, the number of items that the qgroup reserves is
5327  * different with the free space reservation. So we can not use
5328  * the space reseravtion mechanism in start_transaction().
5329  */
5330 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5331                                      struct btrfs_block_rsv *rsv,
5332                                      int items,
5333                                      u64 *qgroup_reserved,
5334                                      bool use_global_rsv)
5335 {
5336         u64 num_bytes;
5337         int ret;
5338         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5339
5340         if (root->fs_info->quota_enabled) {
5341                 /* One for parent inode, two for dir entries */
5342                 num_bytes = 3 * root->nodesize;
5343                 ret = btrfs_qgroup_reserve(root, num_bytes);
5344                 if (ret)
5345                         return ret;
5346         } else {
5347                 num_bytes = 0;
5348         }
5349
5350         *qgroup_reserved = num_bytes;
5351
5352         num_bytes = btrfs_calc_trans_metadata_size(root, items);
5353         rsv->space_info = __find_space_info(root->fs_info,
5354                                             BTRFS_BLOCK_GROUP_METADATA);
5355         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5356                                   BTRFS_RESERVE_FLUSH_ALL);
5357
5358         if (ret == -ENOSPC && use_global_rsv)
5359                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
5360
5361         if (ret) {
5362                 if (*qgroup_reserved)
5363                         btrfs_qgroup_free(root, *qgroup_reserved);
5364         }
5365
5366         return ret;
5367 }
5368
5369 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5370                                       struct btrfs_block_rsv *rsv,
5371                                       u64 qgroup_reserved)
5372 {
5373         btrfs_block_rsv_release(root, rsv, (u64)-1);
5374 }
5375
5376 /**
5377  * drop_outstanding_extent - drop an outstanding extent
5378  * @inode: the inode we're dropping the extent for
5379  * @num_bytes: the number of bytes we're relaseing.
5380  *
5381  * This is called when we are freeing up an outstanding extent, either called
5382  * after an error or after an extent is written.  This will return the number of
5383  * reserved extents that need to be freed.  This must be called with
5384  * BTRFS_I(inode)->lock held.
5385  */
5386 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5387 {
5388         unsigned drop_inode_space = 0;
5389         unsigned dropped_extents = 0;
5390         unsigned num_extents = 0;
5391
5392         num_extents = (unsigned)div64_u64(num_bytes +
5393                                           BTRFS_MAX_EXTENT_SIZE - 1,
5394                                           BTRFS_MAX_EXTENT_SIZE);
5395         ASSERT(num_extents);
5396         ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5397         BTRFS_I(inode)->outstanding_extents -= num_extents;
5398
5399         if (BTRFS_I(inode)->outstanding_extents == 0 &&
5400             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5401                                &BTRFS_I(inode)->runtime_flags))
5402                 drop_inode_space = 1;
5403
5404         /*
5405          * If we have more or the same amount of outsanding extents than we have
5406          * reserved then we need to leave the reserved extents count alone.
5407          */
5408         if (BTRFS_I(inode)->outstanding_extents >=
5409             BTRFS_I(inode)->reserved_extents)
5410                 return drop_inode_space;
5411
5412         dropped_extents = BTRFS_I(inode)->reserved_extents -
5413                 BTRFS_I(inode)->outstanding_extents;
5414         BTRFS_I(inode)->reserved_extents -= dropped_extents;
5415         return dropped_extents + drop_inode_space;
5416 }
5417
5418 /**
5419  * calc_csum_metadata_size - return the amount of metada space that must be
5420  *      reserved/free'd for the given bytes.
5421  * @inode: the inode we're manipulating
5422  * @num_bytes: the number of bytes in question
5423  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5424  *
5425  * This adjusts the number of csum_bytes in the inode and then returns the
5426  * correct amount of metadata that must either be reserved or freed.  We
5427  * calculate how many checksums we can fit into one leaf and then divide the
5428  * number of bytes that will need to be checksumed by this value to figure out
5429  * how many checksums will be required.  If we are adding bytes then the number
5430  * may go up and we will return the number of additional bytes that must be
5431  * reserved.  If it is going down we will return the number of bytes that must
5432  * be freed.
5433  *
5434  * This must be called with BTRFS_I(inode)->lock held.
5435  */
5436 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5437                                    int reserve)
5438 {
5439         struct btrfs_root *root = BTRFS_I(inode)->root;
5440         u64 old_csums, num_csums;
5441
5442         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5443             BTRFS_I(inode)->csum_bytes == 0)
5444                 return 0;
5445
5446         old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5447         if (reserve)
5448                 BTRFS_I(inode)->csum_bytes += num_bytes;
5449         else
5450                 BTRFS_I(inode)->csum_bytes -= num_bytes;
5451         num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5452
5453         /* No change, no need to reserve more */
5454         if (old_csums == num_csums)
5455                 return 0;
5456
5457         if (reserve)
5458                 return btrfs_calc_trans_metadata_size(root,
5459                                                       num_csums - old_csums);
5460
5461         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5462 }
5463
5464 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5465 {
5466         struct btrfs_root *root = BTRFS_I(inode)->root;
5467         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5468         u64 to_reserve = 0;
5469         u64 csum_bytes;
5470         unsigned nr_extents = 0;
5471         int extra_reserve = 0;
5472         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5473         int ret = 0;
5474         bool delalloc_lock = true;
5475         u64 to_free = 0;
5476         unsigned dropped;
5477
5478         /* If we are a free space inode we need to not flush since we will be in
5479          * the middle of a transaction commit.  We also don't need the delalloc
5480          * mutex since we won't race with anybody.  We need this mostly to make
5481          * lockdep shut its filthy mouth.
5482          */
5483         if (btrfs_is_free_space_inode(inode)) {
5484                 flush = BTRFS_RESERVE_NO_FLUSH;
5485                 delalloc_lock = false;
5486         }
5487
5488         if (flush != BTRFS_RESERVE_NO_FLUSH &&
5489             btrfs_transaction_in_commit(root->fs_info))
5490                 schedule_timeout(1);
5491
5492         if (delalloc_lock)
5493                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5494
5495         num_bytes = ALIGN(num_bytes, root->sectorsize);
5496
5497         spin_lock(&BTRFS_I(inode)->lock);
5498         nr_extents = (unsigned)div64_u64(num_bytes +
5499                                          BTRFS_MAX_EXTENT_SIZE - 1,
5500                                          BTRFS_MAX_EXTENT_SIZE);
5501         BTRFS_I(inode)->outstanding_extents += nr_extents;
5502         nr_extents = 0;
5503
5504         if (BTRFS_I(inode)->outstanding_extents >
5505             BTRFS_I(inode)->reserved_extents)
5506                 nr_extents = BTRFS_I(inode)->outstanding_extents -
5507                         BTRFS_I(inode)->reserved_extents;
5508
5509         /*
5510          * Add an item to reserve for updating the inode when we complete the
5511          * delalloc io.
5512          */
5513         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5514                       &BTRFS_I(inode)->runtime_flags)) {
5515                 nr_extents++;
5516                 extra_reserve = 1;
5517         }
5518
5519         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
5520         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5521         csum_bytes = BTRFS_I(inode)->csum_bytes;
5522         spin_unlock(&BTRFS_I(inode)->lock);
5523
5524         if (root->fs_info->quota_enabled) {
5525                 ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
5526                 if (ret)
5527                         goto out_fail;
5528         }
5529
5530         ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5531         if (unlikely(ret)) {
5532                 if (root->fs_info->quota_enabled)
5533                         btrfs_qgroup_free(root, nr_extents * root->nodesize);
5534                 goto out_fail;
5535         }
5536
5537         spin_lock(&BTRFS_I(inode)->lock);
5538         if (extra_reserve) {
5539                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5540                         &BTRFS_I(inode)->runtime_flags);
5541                 nr_extents--;
5542         }
5543         BTRFS_I(inode)->reserved_extents += nr_extents;
5544         spin_unlock(&BTRFS_I(inode)->lock);
5545
5546         if (delalloc_lock)
5547                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5548
5549         if (to_reserve)
5550                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5551                                               btrfs_ino(inode), to_reserve, 1);
5552         block_rsv_add_bytes(block_rsv, to_reserve, 1);
5553
5554         return 0;
5555
5556 out_fail:
5557         spin_lock(&BTRFS_I(inode)->lock);
5558         dropped = drop_outstanding_extent(inode, num_bytes);
5559         /*
5560          * If the inodes csum_bytes is the same as the original
5561          * csum_bytes then we know we haven't raced with any free()ers
5562          * so we can just reduce our inodes csum bytes and carry on.
5563          */
5564         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5565                 calc_csum_metadata_size(inode, num_bytes, 0);
5566         } else {
5567                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5568                 u64 bytes;
5569
5570                 /*
5571                  * This is tricky, but first we need to figure out how much we
5572                  * free'd from any free-ers that occured during this
5573                  * reservation, so we reset ->csum_bytes to the csum_bytes
5574                  * before we dropped our lock, and then call the free for the
5575                  * number of bytes that were freed while we were trying our
5576                  * reservation.
5577                  */
5578                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5579                 BTRFS_I(inode)->csum_bytes = csum_bytes;
5580                 to_free = calc_csum_metadata_size(inode, bytes, 0);
5581
5582
5583                 /*
5584                  * Now we need to see how much we would have freed had we not
5585                  * been making this reservation and our ->csum_bytes were not
5586                  * artificially inflated.
5587                  */
5588                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5589                 bytes = csum_bytes - orig_csum_bytes;
5590                 bytes = calc_csum_metadata_size(inode, bytes, 0);
5591
5592                 /*
5593                  * Now reset ->csum_bytes to what it should be.  If bytes is
5594                  * more than to_free then we would have free'd more space had we
5595                  * not had an artificially high ->csum_bytes, so we need to free
5596                  * the remainder.  If bytes is the same or less then we don't
5597                  * need to do anything, the other free-ers did the correct
5598                  * thing.
5599                  */
5600                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5601                 if (bytes > to_free)
5602                         to_free = bytes - to_free;
5603                 else
5604                         to_free = 0;
5605         }
5606         spin_unlock(&BTRFS_I(inode)->lock);
5607         if (dropped)
5608                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5609
5610         if (to_free) {
5611                 btrfs_block_rsv_release(root, block_rsv, to_free);
5612                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5613                                               btrfs_ino(inode), to_free, 0);
5614         }
5615         if (delalloc_lock)
5616                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5617         return ret;
5618 }
5619
5620 /**
5621  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5622  * @inode: the inode to release the reservation for
5623  * @num_bytes: the number of bytes we're releasing
5624  *
5625  * This will release the metadata reservation for an inode.  This can be called
5626  * once we complete IO for a given set of bytes to release their metadata
5627  * reservations.
5628  */
5629 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5630 {
5631         struct btrfs_root *root = BTRFS_I(inode)->root;
5632         u64 to_free = 0;
5633         unsigned dropped;
5634
5635         num_bytes = ALIGN(num_bytes, root->sectorsize);
5636         spin_lock(&BTRFS_I(inode)->lock);
5637         dropped = drop_outstanding_extent(inode, num_bytes);
5638
5639         if (num_bytes)
5640                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5641         spin_unlock(&BTRFS_I(inode)->lock);
5642         if (dropped > 0)
5643                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5644
5645         if (btrfs_test_is_dummy_root(root))
5646                 return;
5647
5648         trace_btrfs_space_reservation(root->fs_info, "delalloc",
5649                                       btrfs_ino(inode), to_free, 0);
5650
5651         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5652                                 to_free);
5653 }
5654
5655 /**
5656  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5657  * @inode: inode we're writing to
5658  * @num_bytes: the number of bytes we want to allocate
5659  *
5660  * This will do the following things
5661  *
5662  * o reserve space in the data space info for num_bytes
5663  * o reserve space in the metadata space info based on number of outstanding
5664  *   extents and how much csums will be needed
5665  * o add to the inodes ->delalloc_bytes
5666  * o add it to the fs_info's delalloc inodes list.
5667  *
5668  * This will return 0 for success and -ENOSPC if there is no space left.
5669  */
5670 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5671 {
5672         int ret;
5673
5674         ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
5675         if (ret)
5676                 return ret;
5677
5678         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5679         if (ret) {
5680                 btrfs_free_reserved_data_space(inode, num_bytes);
5681                 return ret;
5682         }
5683
5684         return 0;
5685 }
5686
5687 /**
5688  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5689  * @inode: inode we're releasing space for
5690  * @num_bytes: the number of bytes we want to free up
5691  *
5692  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5693  * called in the case that we don't need the metadata AND data reservations
5694  * anymore.  So if there is an error or we insert an inline extent.
5695  *
5696  * This function will release the metadata space that was not used and will
5697  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5698  * list if there are no delalloc bytes left.
5699  */
5700 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5701 {
5702         btrfs_delalloc_release_metadata(inode, num_bytes);
5703         btrfs_free_reserved_data_space(inode, num_bytes);
5704 }
5705
5706 static int update_block_group(struct btrfs_trans_handle *trans,
5707                               struct btrfs_root *root, u64 bytenr,
5708                               u64 num_bytes, int alloc)
5709 {
5710         struct btrfs_block_group_cache *cache = NULL;
5711         struct btrfs_fs_info *info = root->fs_info;
5712         u64 total = num_bytes;
5713         u64 old_val;
5714         u64 byte_in_group;
5715         int factor;
5716
5717         /* block accounting for super block */
5718         spin_lock(&info->delalloc_root_lock);
5719         old_val = btrfs_super_bytes_used(info->super_copy);
5720         if (alloc)
5721                 old_val += num_bytes;
5722         else
5723                 old_val -= num_bytes;
5724         btrfs_set_super_bytes_used(info->super_copy, old_val);
5725         spin_unlock(&info->delalloc_root_lock);
5726
5727         while (total) {
5728                 cache = btrfs_lookup_block_group(info, bytenr);
5729                 if (!cache)
5730                         return -ENOENT;
5731                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5732                                     BTRFS_BLOCK_GROUP_RAID1 |
5733                                     BTRFS_BLOCK_GROUP_RAID10))
5734                         factor = 2;
5735                 else
5736                         factor = 1;
5737                 /*
5738                  * If this block group has free space cache written out, we
5739                  * need to make sure to load it if we are removing space.  This
5740                  * is because we need the unpinning stage to actually add the
5741                  * space back to the block group, otherwise we will leak space.
5742                  */
5743                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5744                         cache_block_group(cache, 1);
5745
5746                 byte_in_group = bytenr - cache->key.objectid;
5747                 WARN_ON(byte_in_group > cache->key.offset);
5748
5749                 spin_lock(&cache->space_info->lock);
5750                 spin_lock(&cache->lock);
5751
5752                 if (btrfs_test_opt(root, SPACE_CACHE) &&
5753                     cache->disk_cache_state < BTRFS_DC_CLEAR)
5754                         cache->disk_cache_state = BTRFS_DC_CLEAR;
5755
5756                 old_val = btrfs_block_group_used(&cache->item);
5757                 num_bytes = min(total, cache->key.offset - byte_in_group);
5758                 if (alloc) {
5759                         old_val += num_bytes;
5760                         btrfs_set_block_group_used(&cache->item, old_val);
5761                         cache->reserved -= num_bytes;
5762                         cache->space_info->bytes_reserved -= num_bytes;
5763                         cache->space_info->bytes_used += num_bytes;
5764                         cache->space_info->disk_used += num_bytes * factor;
5765                         spin_unlock(&cache->lock);
5766                         spin_unlock(&cache->space_info->lock);
5767                 } else {
5768                         old_val -= num_bytes;
5769                         btrfs_set_block_group_used(&cache->item, old_val);
5770                         cache->pinned += num_bytes;
5771                         cache->space_info->bytes_pinned += num_bytes;
5772                         cache->space_info->bytes_used -= num_bytes;
5773                         cache->space_info->disk_used -= num_bytes * factor;
5774                         spin_unlock(&cache->lock);
5775                         spin_unlock(&cache->space_info->lock);
5776
5777                         set_extent_dirty(info->pinned_extents,
5778                                          bytenr, bytenr + num_bytes - 1,
5779                                          GFP_NOFS | __GFP_NOFAIL);
5780                         /*
5781                          * No longer have used bytes in this block group, queue
5782                          * it for deletion.
5783                          */
5784                         if (old_val == 0) {
5785                                 spin_lock(&info->unused_bgs_lock);
5786                                 if (list_empty(&cache->bg_list)) {
5787                                         btrfs_get_block_group(cache);
5788                                         list_add_tail(&cache->bg_list,
5789                                                       &info->unused_bgs);
5790                                 }
5791                                 spin_unlock(&info->unused_bgs_lock);
5792                         }
5793                 }
5794
5795                 spin_lock(&trans->transaction->dirty_bgs_lock);
5796                 if (list_empty(&cache->dirty_list)) {
5797                         list_add_tail(&cache->dirty_list,
5798                                       &trans->transaction->dirty_bgs);
5799                                 trans->transaction->num_dirty_bgs++;
5800                         btrfs_get_block_group(cache);
5801                 }
5802                 spin_unlock(&trans->transaction->dirty_bgs_lock);
5803
5804                 btrfs_put_block_group(cache);
5805                 total -= num_bytes;
5806                 bytenr += num_bytes;
5807         }
5808         return 0;
5809 }
5810
5811 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5812 {
5813         struct btrfs_block_group_cache *cache;
5814         u64 bytenr;
5815
5816         spin_lock(&root->fs_info->block_group_cache_lock);
5817         bytenr = root->fs_info->first_logical_byte;
5818         spin_unlock(&root->fs_info->block_group_cache_lock);
5819
5820         if (bytenr < (u64)-1)
5821                 return bytenr;
5822
5823         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5824         if (!cache)
5825                 return 0;
5826
5827         bytenr = cache->key.objectid;
5828         btrfs_put_block_group(cache);
5829
5830         return bytenr;
5831 }
5832
5833 static int pin_down_extent(struct btrfs_root *root,
5834                            struct btrfs_block_group_cache *cache,
5835                            u64 bytenr, u64 num_bytes, int reserved)
5836 {
5837         spin_lock(&cache->space_info->lock);
5838         spin_lock(&cache->lock);
5839         cache->pinned += num_bytes;
5840         cache->space_info->bytes_pinned += num_bytes;
5841         if (reserved) {
5842                 cache->reserved -= num_bytes;
5843                 cache->space_info->bytes_reserved -= num_bytes;
5844         }
5845         spin_unlock(&cache->lock);
5846         spin_unlock(&cache->space_info->lock);
5847
5848         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5849                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5850         if (reserved)
5851                 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
5852         return 0;
5853 }
5854
5855 /*
5856  * this function must be called within transaction
5857  */
5858 int btrfs_pin_extent(struct btrfs_root *root,
5859                      u64 bytenr, u64 num_bytes, int reserved)
5860 {
5861         struct btrfs_block_group_cache *cache;
5862
5863         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5864         BUG_ON(!cache); /* Logic error */
5865
5866         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5867
5868         btrfs_put_block_group(cache);
5869         return 0;
5870 }
5871
5872 /*
5873  * this function must be called within transaction
5874  */
5875 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5876                                     u64 bytenr, u64 num_bytes)
5877 {
5878         struct btrfs_block_group_cache *cache;
5879         int ret;
5880
5881         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5882         if (!cache)
5883                 return -EINVAL;
5884
5885         /*
5886          * pull in the free space cache (if any) so that our pin
5887          * removes the free space from the cache.  We have load_only set
5888          * to one because the slow code to read in the free extents does check
5889          * the pinned extents.
5890          */
5891         cache_block_group(cache, 1);
5892
5893         pin_down_extent(root, cache, bytenr, num_bytes, 0);
5894
5895         /* remove us from the free space cache (if we're there at all) */
5896         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5897         btrfs_put_block_group(cache);
5898         return ret;
5899 }
5900
5901 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5902 {
5903         int ret;
5904         struct btrfs_block_group_cache *block_group;
5905         struct btrfs_caching_control *caching_ctl;
5906
5907         block_group = btrfs_lookup_block_group(root->fs_info, start);
5908         if (!block_group)
5909                 return -EINVAL;
5910
5911         cache_block_group(block_group, 0);
5912         caching_ctl = get_caching_control(block_group);
5913
5914         if (!caching_ctl) {
5915                 /* Logic error */
5916                 BUG_ON(!block_group_cache_done(block_group));
5917                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5918         } else {
5919                 mutex_lock(&caching_ctl->mutex);
5920
5921                 if (start >= caching_ctl->progress) {
5922                         ret = add_excluded_extent(root, start, num_bytes);
5923                 } else if (start + num_bytes <= caching_ctl->progress) {
5924                         ret = btrfs_remove_free_space(block_group,
5925                                                       start, num_bytes);
5926                 } else {
5927                         num_bytes = caching_ctl->progress - start;
5928                         ret = btrfs_remove_free_space(block_group,
5929                                                       start, num_bytes);
5930                         if (ret)
5931                                 goto out_lock;
5932
5933                         num_bytes = (start + num_bytes) -
5934                                 caching_ctl->progress;
5935                         start = caching_ctl->progress;
5936                         ret = add_excluded_extent(root, start, num_bytes);
5937                 }
5938 out_lock:
5939                 mutex_unlock(&caching_ctl->mutex);
5940                 put_caching_control(caching_ctl);
5941         }
5942         btrfs_put_block_group(block_group);
5943         return ret;
5944 }
5945
5946 int btrfs_exclude_logged_extents(struct btrfs_root *log,
5947                                  struct extent_buffer *eb)
5948 {
5949         struct btrfs_file_extent_item *item;
5950         struct btrfs_key key;
5951         int found_type;
5952         int i;
5953
5954         if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5955                 return 0;
5956
5957         for (i = 0; i < btrfs_header_nritems(eb); i++) {
5958                 btrfs_item_key_to_cpu(eb, &key, i);
5959                 if (key.type != BTRFS_EXTENT_DATA_KEY)
5960                         continue;
5961                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5962                 found_type = btrfs_file_extent_type(eb, item);
5963                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5964                         continue;
5965                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5966                         continue;
5967                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5968                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5969                 __exclude_logged_extent(log, key.objectid, key.offset);
5970         }
5971
5972         return 0;
5973 }
5974
5975 /**
5976  * btrfs_update_reserved_bytes - update the block_group and space info counters
5977  * @cache:      The cache we are manipulating
5978  * @num_bytes:  The number of bytes in question
5979  * @reserve:    One of the reservation enums
5980  * @delalloc:   The blocks are allocated for the delalloc write
5981  *
5982  * This is called by the allocator when it reserves space, or by somebody who is
5983  * freeing space that was never actually used on disk.  For example if you
5984  * reserve some space for a new leaf in transaction A and before transaction A
5985  * commits you free that leaf, you call this with reserve set to 0 in order to
5986  * clear the reservation.
5987  *
5988  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
5989  * ENOSPC accounting.  For data we handle the reservation through clearing the
5990  * delalloc bits in the io_tree.  We have to do this since we could end up
5991  * allocating less disk space for the amount of data we have reserved in the
5992  * case of compression.
5993  *
5994  * If this is a reservation and the block group has become read only we cannot
5995  * make the reservation and return -EAGAIN, otherwise this function always
5996  * succeeds.
5997  */
5998 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5999                                        u64 num_bytes, int reserve, int delalloc)
6000 {
6001         struct btrfs_space_info *space_info = cache->space_info;
6002         int ret = 0;
6003
6004         spin_lock(&space_info->lock);
6005         spin_lock(&cache->lock);
6006         if (reserve != RESERVE_FREE) {
6007                 if (cache->ro) {
6008                         ret = -EAGAIN;
6009                 } else {
6010                         cache->reserved += num_bytes;
6011                         space_info->bytes_reserved += num_bytes;
6012                         if (reserve == RESERVE_ALLOC) {
6013                                 trace_btrfs_space_reservation(cache->fs_info,
6014                                                 "space_info", space_info->flags,
6015                                                 num_bytes, 0);
6016                                 space_info->bytes_may_use -= num_bytes;
6017                         }
6018
6019                         if (delalloc)
6020                                 cache->delalloc_bytes += num_bytes;
6021                 }
6022         } else {
6023                 if (cache->ro)
6024                         space_info->bytes_readonly += num_bytes;
6025                 cache->reserved -= num_bytes;
6026                 space_info->bytes_reserved -= num_bytes;
6027
6028                 if (delalloc)
6029                         cache->delalloc_bytes -= num_bytes;
6030         }
6031         spin_unlock(&cache->lock);
6032         spin_unlock(&space_info->lock);
6033         return ret;
6034 }
6035
6036 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6037                                 struct btrfs_root *root)
6038 {
6039         struct btrfs_fs_info *fs_info = root->fs_info;
6040         struct btrfs_caching_control *next;
6041         struct btrfs_caching_control *caching_ctl;
6042         struct btrfs_block_group_cache *cache;
6043
6044         down_write(&fs_info->commit_root_sem);
6045
6046         list_for_each_entry_safe(caching_ctl, next,
6047                                  &fs_info->caching_block_groups, list) {
6048                 cache = caching_ctl->block_group;
6049                 if (block_group_cache_done(cache)) {
6050                         cache->last_byte_to_unpin = (u64)-1;
6051                         list_del_init(&caching_ctl->list);
6052                         put_caching_control(caching_ctl);
6053                 } else {
6054                         cache->last_byte_to_unpin = caching_ctl->progress;
6055                 }
6056         }
6057
6058         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6059                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6060         else
6061                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6062
6063         up_write(&fs_info->commit_root_sem);
6064
6065         update_global_block_rsv(fs_info);
6066 }
6067
6068 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6069                               const bool return_free_space)
6070 {
6071         struct btrfs_fs_info *fs_info = root->fs_info;
6072         struct btrfs_block_group_cache *cache = NULL;
6073         struct btrfs_space_info *space_info;
6074         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6075         u64 len;
6076         bool readonly;
6077
6078         while (start <= end) {
6079                 readonly = false;
6080                 if (!cache ||
6081                     start >= cache->key.objectid + cache->key.offset) {
6082                         if (cache)
6083                                 btrfs_put_block_group(cache);
6084                         cache = btrfs_lookup_block_group(fs_info, start);
6085                         BUG_ON(!cache); /* Logic error */
6086                 }
6087
6088                 len = cache->key.objectid + cache->key.offset - start;
6089                 len = min(len, end + 1 - start);
6090
6091                 if (start < cache->last_byte_to_unpin) {
6092                         len = min(len, cache->last_byte_to_unpin - start);
6093                         if (return_free_space)
6094                                 btrfs_add_free_space(cache, start, len);
6095                 }
6096
6097                 start += len;
6098                 space_info = cache->space_info;
6099
6100                 spin_lock(&space_info->lock);
6101                 spin_lock(&cache->lock);
6102                 cache->pinned -= len;
6103                 space_info->bytes_pinned -= len;
6104                 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6105                 if (cache->ro) {
6106                         space_info->bytes_readonly += len;
6107                         readonly = true;
6108                 }
6109                 spin_unlock(&cache->lock);
6110                 if (!readonly && global_rsv->space_info == space_info) {
6111                         spin_lock(&global_rsv->lock);
6112                         if (!global_rsv->full) {
6113                                 len = min(len, global_rsv->size -
6114                                           global_rsv->reserved);
6115                                 global_rsv->reserved += len;
6116                                 space_info->bytes_may_use += len;
6117                                 if (global_rsv->reserved >= global_rsv->size)
6118                                         global_rsv->full = 1;
6119                         }
6120                         spin_unlock(&global_rsv->lock);
6121                 }
6122                 spin_unlock(&space_info->lock);
6123         }
6124
6125         if (cache)
6126                 btrfs_put_block_group(cache);
6127         return 0;
6128 }
6129
6130 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6131                                struct btrfs_root *root)
6132 {
6133         struct btrfs_fs_info *fs_info = root->fs_info;
6134         struct extent_io_tree *unpin;
6135         u64 start;
6136         u64 end;
6137         int ret;
6138
6139         if (trans->aborted)
6140                 return 0;
6141
6142         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6143                 unpin = &fs_info->freed_extents[1];
6144         else
6145                 unpin = &fs_info->freed_extents[0];
6146
6147         while (1) {
6148                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6149                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6150                                             EXTENT_DIRTY, NULL);
6151                 if (ret) {
6152                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6153                         break;
6154                 }
6155
6156                 if (btrfs_test_opt(root, DISCARD))
6157                         ret = btrfs_discard_extent(root, start,
6158                                                    end + 1 - start, NULL);
6159
6160                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
6161                 unpin_extent_range(root, start, end, true);
6162                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6163                 cond_resched();
6164         }
6165
6166         return 0;
6167 }
6168
6169 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6170                              u64 owner, u64 root_objectid)
6171 {
6172         struct btrfs_space_info *space_info;
6173         u64 flags;
6174
6175         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6176                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6177                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
6178                 else
6179                         flags = BTRFS_BLOCK_GROUP_METADATA;
6180         } else {
6181                 flags = BTRFS_BLOCK_GROUP_DATA;
6182         }
6183
6184         space_info = __find_space_info(fs_info, flags);
6185         BUG_ON(!space_info); /* Logic bug */
6186         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6187 }
6188
6189
/*
 * Drop 'refs_to_drop' references to the extent described by 'node'
 * (node->bytenr, node->num_bytes) from the extent tree.
 *
 * The backref identified by (parent, root_objectid, owner_objectid,
 * owner_offset) is located with lookup_extent_backref().  If references
 * remain after the drop, the extent item's count is updated and the backref
 * is removed.  If none remain, the extent item (plus a separate backref item
 * when there is one) is deleted, checksums are deleted for data extents, and
 * update_block_group() is called with a final argument of 0.
 *
 * Every error path that jumps to 'out' aborts the transaction first; -ENOMEM
 * from btrfs_alloc_path() is returned directly.  Returns 0 on the success
 * paths and a negative errno on the abort paths.
 *
 * NOTE(review): 'no_quota' and 'last_ref' are assigned here but never read
 * again inside this function.
 */
6190 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6191                                 struct btrfs_root *root,
6192                                 struct btrfs_delayed_ref_node *node, u64 parent,
6193                                 u64 root_objectid, u64 owner_objectid,
6194                                 u64 owner_offset, int refs_to_drop,
6195                                 struct btrfs_delayed_extent_op *extent_op)
6196 {
6197         struct btrfs_key key;
6198         struct btrfs_path *path;
6199         struct btrfs_fs_info *info = root->fs_info;
6200         struct btrfs_root *extent_root = info->extent_root;
6201         struct extent_buffer *leaf;
6202         struct btrfs_extent_item *ei;
6203         struct btrfs_extent_inline_ref *iref;
6204         int ret;
6205         int is_data;
6206         int extent_slot = 0;
6207         int found_extent = 0;
6208         int num_to_del = 1;
6209         int no_quota = node->no_quota;
6210         u32 item_size;
6211         u64 refs;
6212         u64 bytenr = node->bytenr;
6213         u64 num_bytes = node->num_bytes;
6214         int last_ref = 0;
6215         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6216                                                  SKINNY_METADATA);
6217
6218         if (!info->quota_enabled || !is_fstree(root_objectid))
6219                 no_quota = 1;
6220
6221         path = btrfs_alloc_path();
6222         if (!path)
6223                 return -ENOMEM;
6224
6225         path->reada = 1;
6226         path->leave_spinning = 1;
6227
             /* Owners at or above BTRFS_FIRST_FREE_OBJECTID are data extents. */
6228         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
             /* Non-data (tree block) extents only ever drop one ref at a time. */
6229         BUG_ON(!is_data && refs_to_drop != 1);
6230
             /* The METADATA_ITEM key form below is only tried for non-data extents. */
6231         if (is_data)
6232                 skinny_metadata = 0;
6233
6234         ret = lookup_extent_backref(trans, extent_root, path, &iref,
6235                                     bytenr, num_bytes, parent,
6236                                     root_objectid, owner_objectid,
6237                                     owner_offset);
6238         if (ret == 0) {
                     /*
                      * Walk backwards from the backref's slot, staying on items
                      * keyed by this bytenr and giving up after a few slots,
                      * looking for the extent item itself in the same leaf.
                      */
6239                 extent_slot = path->slots[0];
6240                 while (extent_slot >= 0) {
6241                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6242                                               extent_slot);
6243                         if (key.objectid != bytenr)
6244                                 break;
6245                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6246                             key.offset == num_bytes) {
6247                                 found_extent = 1;
6248                                 break;
6249                         }
6250                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6251                             key.offset == owner_objectid) {
6252                                 found_extent = 1;
6253                                 break;
6254                         }
6255                         if (path->slots[0] - extent_slot > 5)
6256                                 break;
6257                         extent_slot--;
6258                 }
6259 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                     /* An item smaller than the current struct is a v0 item: treat as not found. */
6260                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6261                 if (found_extent && item_size < sizeof(*ei))
6262                         found_extent = 0;
6263 #endif
6264                 if (!found_extent) {
                             /*
                              * The extent item is not nearby, so the backref is a
                              * standalone item (no inline ref).  Remove it first,
                              * then search for the extent item by key.
                              */
6265                         BUG_ON(iref);
6266                         ret = remove_extent_backref(trans, extent_root, path,
6267                                                     NULL, refs_to_drop,
6268                                                     is_data, &last_ref);
6269                         if (ret) {
6270                                 btrfs_abort_transaction(trans, extent_root, ret);
6271                                 goto out;
6272                         }
6273                         btrfs_release_path(path);
6274                         path->leave_spinning = 1;
6275
6276                         key.objectid = bytenr;
6277                         key.type = BTRFS_EXTENT_ITEM_KEY;
6278                         key.offset = num_bytes;
6279
6280                         if (!is_data && skinny_metadata) {
6281                                 key.type = BTRFS_METADATA_ITEM_KEY;
6282                                 key.offset = owner_objectid;
6283                         }
6284
6285                         ret = btrfs_search_slot(trans, extent_root,
6286                                                 &key, path, -1, 1);
6287                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6288                                 /*
6289                                  * Couldn't find our skinny metadata item,
6290                                  * see if we have ye olde extent item.
6291                                  */
6292                                 path->slots[0]--;
6293                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6294                                                       path->slots[0]);
6295                                 if (key.objectid == bytenr &&
6296                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6297                                     key.offset == num_bytes)
6298                                         ret = 0;
6299                         }
6300
                             /* Still not found: redo the search with the EXTENT_ITEM key. */
6301                         if (ret > 0 && skinny_metadata) {
6302                                 skinny_metadata = false;
6303                                 key.objectid = bytenr;
6304                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6305                                 key.offset = num_bytes;
6306                                 btrfs_release_path(path);
6307                                 ret = btrfs_search_slot(trans, extent_root,
6308                                                         &key, path, -1, 1);
6309                         }
6310
                             /*
                              * NOTE(review): a positive ret (key not found) is
                              * only logged here; just ret < 0 aborts.
                              */
6311                         if (ret) {
6312                                 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6313                                         ret, bytenr);
6314                                 if (ret > 0)
6315                                         btrfs_print_leaf(extent_root,
6316                                                          path->nodes[0]);
6317                         }
6318                         if (ret < 0) {
6319                                 btrfs_abort_transaction(trans, extent_root, ret);
6320                                 goto out;
6321                         }
6322                         extent_slot = path->slots[0];
6323                 }
6324         } else if (WARN_ON(ret == -ENOENT)) {
6325                 btrfs_print_leaf(extent_root, path->nodes[0]);
6326                 btrfs_err(info,
6327                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6328                         bytenr, parent, root_objectid, owner_objectid,
6329                         owner_offset);
6330                 btrfs_abort_transaction(trans, extent_root, ret);
6331                 goto out;
6332         } else {
6333                 btrfs_abort_transaction(trans, extent_root, ret);
6334                 goto out;
6335         }
6336
             /* From here on, extent_slot in path->nodes[0] is the extent item. */
6337         leaf = path->nodes[0];
6338         item_size = btrfs_item_size_nr(leaf, extent_slot);
6339 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
             /* Convert a v0 extent item, then look it up again. */
6340         if (item_size < sizeof(*ei)) {
6341                 BUG_ON(found_extent || extent_slot != path->slots[0]);
6342                 ret = convert_extent_item_v0(trans, extent_root, path,
6343                                              owner_objectid, 0);
6344                 if (ret < 0) {
6345                         btrfs_abort_transaction(trans, extent_root, ret);
6346                         goto out;
6347                 }
6348
6349                 btrfs_release_path(path);
6350                 path->leave_spinning = 1;
6351
6352                 key.objectid = bytenr;
6353                 key.type = BTRFS_EXTENT_ITEM_KEY;
6354                 key.offset = num_bytes;
6355
6356                 ret = btrfs_search_slot(trans, extent_root, &key, path,
6357                                         -1, 1);
6358                 if (ret) {
6359                         btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6360                                 ret, bytenr);
6361                         btrfs_print_leaf(extent_root, path->nodes[0]);
6362                 }
6363                 if (ret < 0) {
6364                         btrfs_abort_transaction(trans, extent_root, ret);
6365                         goto out;
6366                 }
6367
6368                 extent_slot = path->slots[0];
6369                 leaf = path->nodes[0];
6370                 item_size = btrfs_item_size_nr(leaf, extent_slot);
6371         }
6372 #endif
6373         BUG_ON(item_size < sizeof(*ei));
6374         ei = btrfs_item_ptr(leaf, extent_slot,
6375                             struct btrfs_extent_item);
             /*
              * For a non-data extent stored under an EXTENT_ITEM key, the tree
              * block info follows the extent item; its level is expected to
              * equal owner_objectid.
              */
6376         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6377             key.type == BTRFS_EXTENT_ITEM_KEY) {
6378                 struct btrfs_tree_block_info *bi;
6379                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6380                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6381                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6382         }
6383
6384         refs = btrfs_extent_refs(leaf, ei);
             /* Refuse to drop more references than the extent item records. */
6385         if (refs < refs_to_drop) {
6386                 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
6387                           "for bytenr %Lu", refs_to_drop, refs, bytenr);
6388                 ret = -EINVAL;
6389                 btrfs_abort_transaction(trans, extent_root, ret);
6390                 goto out;
6391         }
6392         refs -= refs_to_drop;
6393
             /* References remain: keep the extent item, drop only the backref. */
6394         if (refs > 0) {
6395                 if (extent_op)
6396                         __run_delayed_extent_op(extent_op, leaf, ei);
6397                 /*
6398                  * In the case of inline back ref, reference count will
6399                  * be updated by remove_extent_backref
6400                  */
6401                 if (iref) {
6402                         BUG_ON(!found_extent);
6403                 } else {
6404                         btrfs_set_extent_refs(leaf, ei, refs);
6405                         btrfs_mark_buffer_dirty(leaf);
6406                 }
6407                 if (found_extent) {
6408                         ret = remove_extent_backref(trans, extent_root, path,
6409                                                     iref, refs_to_drop,
6410                                                     is_data, &last_ref);
6411                         if (ret) {
6412                                 btrfs_abort_transaction(trans, extent_root, ret);
6413                                 goto out;
6414                         }
6415                 }
                     /*
                      * Passes the negated size (unsigned wrap-around), presumably
                      * so that the pinned-bytes counter is decreased again.
                      */
6416                 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
6417                                  root_objectid);
6418         } else {
                     /* Last reference gone: delete the extent item itself. */
6419                 if (found_extent) {
6420                         BUG_ON(is_data && refs_to_drop !=
6421                                extent_data_ref_count(root, path, iref));
6422                         if (iref) {
6423                                 BUG_ON(path->slots[0] != extent_slot);
6424                         } else {
                                     /*
                                      * The standalone backref item sits in the slot
                                      * right after the extent item: delete both.
                                      */
6425                                 BUG_ON(path->slots[0] != extent_slot + 1);
6426                                 path->slots[0] = extent_slot;
6427                                 num_to_del = 2;
6428                         }
6429                 }
6430
6431                 last_ref = 1;
6432                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6433                                       num_to_del);
6434                 if (ret) {
6435                         btrfs_abort_transaction(trans, extent_root, ret);
6436                         goto out;
6437                 }
6438                 btrfs_release_path(path);
6439
                     /* Only data extents have checksums to delete. */
6440                 if (is_data) {
6441                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
6442                         if (ret) {
6443                                 btrfs_abort_transaction(trans, extent_root, ret);
6444                                 goto out;
6445                         }
6446                 }
6447
6448                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6449                 if (ret) {
6450                         btrfs_abort_transaction(trans, extent_root, ret);
6451                         goto out;
6452                 }
6453         }
6454         btrfs_release_path(path);
6455
6456 out:
6457         btrfs_free_path(path);
6458         return ret;
6459 }
6460
6461 /*
6462  * when we free an block, it is possible (and likely) that we free the last
6463  * delayed ref for that extent as well.  This searches the delayed ref tree for
6464  * a given extent, and if there are no other delayed refs to be processed, it
6465  * removes it from the tree.
6466  */
6467 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6468                                       struct btrfs_root *root, u64 bytenr)
6469 {
6470         struct btrfs_delayed_ref_head *head;
6471         struct btrfs_delayed_ref_root *delayed_refs;
6472         int ret = 0;
6473
6474         delayed_refs = &trans->transaction->delayed_refs;
6475         spin_lock(&delayed_refs->lock);
6476         head = btrfs_find_delayed_ref_head(trans, bytenr);
6477         if (!head)
6478                 goto out_delayed_unlock;
6479
6480         spin_lock(&head->lock);
6481         if (!list_empty(&head->ref_list))
6482                 goto out;
6483
6484         if (head->extent_op) {
6485                 if (!head->must_insert_reserved)
6486                         goto out;
6487                 btrfs_free_delayed_extent_op(head->extent_op);
6488                 head->extent_op = NULL;
6489         }
6490
6491         /*
6492          * waiting for the lock here would deadlock.  If someone else has it
6493          * locked they are already in the process of dropping it anyway
6494          */
6495         if (!mutex_trylock(&head->mutex))
6496                 goto out;
6497
6498         /*
6499          * at this point we have a head with no other entries.  Go
6500          * ahead and process it.
6501          */
6502         head->node.in_tree = 0;
6503         rb_erase(&head->href_node, &delayed_refs->href_root);
6504
6505         atomic_dec(&delayed_refs->num_entries);
6506
6507         /*
6508          * we don't take a ref on the node because we're removing it from the
6509          * tree, so we just steal the ref the tree was holding.
6510          */
6511         delayed_refs->num_heads--;
6512         if (head->processing == 0)
6513                 delayed_refs->num_heads_ready--;
6514         head->processing = 0;
6515         spin_unlock(&head->lock);
6516         spin_unlock(&delayed_refs->lock);
6517
6518         BUG_ON(head->extent_op);
6519         if (head->must_insert_reserved)
6520                 ret = 1;
6521
6522         mutex_unlock(&head->mutex);
6523         btrfs_put_delayed_ref(&head->node);
6524         return ret;
6525 out:
6526         spin_unlock(&head->lock);
6527
6528 out_delayed_unlock:
6529         spin_unlock(&delayed_refs->lock);
6530         return 0;
6531 }
6532
6533 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6534                            struct btrfs_root *root,
6535                            struct extent_buffer *buf,
6536                            u64 parent, int last_ref)
6537 {
6538         int pin = 1;
6539         int ret;
6540
6541         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6542                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6543                                         buf->start, buf->len,
6544                                         parent, root->root_key.objectid,
6545                                         btrfs_header_level(buf),
6546                                         BTRFS_DROP_DELAYED_REF, NULL, 0);
6547                 BUG_ON(ret); /* -ENOMEM */
6548         }
6549
6550         if (!last_ref)
6551                 return;
6552
6553         if (btrfs_header_generation(buf) == trans->transid) {
6554                 struct btrfs_block_group_cache *cache;
6555
6556                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6557                         ret = check_ref_cleanup(trans, root, buf->start);
6558                         if (!ret)
6559                                 goto out;
6560                 }
6561
6562                 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6563
6564                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6565                         pin_down_extent(root, cache, buf->start, buf->len, 1);
6566                         btrfs_put_block_group(cache);
6567                         goto out;
6568                 }
6569
6570                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6571
6572                 btrfs_add_free_space(cache, buf->start, buf->len);
6573                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6574                 btrfs_put_block_group(cache);
6575                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6576                 pin = 0;
6577         }
6578 out:
6579         if (pin)
6580                 add_pinned_bytes(root->fs_info, buf->len,
6581                                  btrfs_header_level(buf),
6582                                  root->root_key.objectid);
6583
6584         /*
6585          * Deleting the buffer, clear the corrupt flag since it doesn't matter
6586          * anymore.
6587          */
6588         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6589 }
6590
6591 /* Can return -ENOMEM */
6592 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6593                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6594                       u64 owner, u64 offset, int no_quota)
6595 {
6596         int ret;
6597         struct btrfs_fs_info *fs_info = root->fs_info;
6598
6599         if (btrfs_test_is_dummy_root(root))
6600                 return 0;
6601
6602         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6603
6604         /*
6605          * tree log blocks never actually go into the extent allocation
6606          * tree, just update pinning info and exit early.
6607          */
6608         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6609                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6610                 /* unlocks the pinned mutex */
6611                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
6612                 ret = 0;
6613         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6614                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6615                                         num_bytes,
6616                                         parent, root_objectid, (int)owner,
6617                                         BTRFS_DROP_DELAYED_REF, NULL, no_quota);
6618         } else {
6619                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6620                                                 num_bytes,
6621                                                 parent, root_objectid, owner,
6622                                                 offset, BTRFS_DROP_DELAYED_REF,
6623                                                 NULL, no_quota);
6624         }
6625         return ret;
6626 }
6627
6628 /*
6629  * when we wait for progress in the block group caching, its because
6630  * our allocation attempt failed at least once.  So, we must sleep
6631  * and let some progress happen before we try again.
6632  *
6633  * This function will sleep at least once waiting for new free space to
6634  * show up, and then it will check the block group free space numbers
6635  * for our min num_bytes.  Another option is to have it go ahead
6636  * and look in the rbtree for a free extent of a given size, but this
6637  * is a good start.
6638  *
6639  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6640  * any of the information in this block group.
6641  */
6642 static noinline void
6643 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6644                                 u64 num_bytes)
6645 {
6646         struct btrfs_caching_control *caching_ctl;
6647
6648         caching_ctl = get_caching_control(cache);
6649         if (!caching_ctl)
6650                 return;
6651
6652         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6653                    (cache->free_space_ctl->free_space >= num_bytes));
6654
6655         put_caching_control(caching_ctl);
6656 }
6657
6658 static noinline int
6659 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6660 {
6661         struct btrfs_caching_control *caching_ctl;
6662         int ret = 0;
6663
6664         caching_ctl = get_caching_control(cache);
6665         if (!caching_ctl)
6666                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6667
6668         wait_event(caching_ctl->wait, block_group_cache_done(cache));
6669         if (cache->cached == BTRFS_CACHE_ERROR)
6670                 ret = -EIO;
6671         put_caching_control(caching_ctl);
6672         return ret;
6673 }
6674
6675 int __get_raid_index(u64 flags)
6676 {
6677         if (flags & BTRFS_BLOCK_GROUP_RAID10)
6678                 return BTRFS_RAID_RAID10;
6679         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6680                 return BTRFS_RAID_RAID1;
6681         else if (flags & BTRFS_BLOCK_GROUP_DUP)
6682                 return BTRFS_RAID_DUP;
6683         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6684                 return BTRFS_RAID_RAID0;
6685         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6686                 return BTRFS_RAID_RAID5;
6687         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6688                 return BTRFS_RAID_RAID6;
6689
6690         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6691 }
6692
6693 int get_block_group_index(struct btrfs_block_group_cache *cache)
6694 {
6695         return __get_raid_index(cache->flags);
6696 }
6697
6698 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
6699         [BTRFS_RAID_RAID10]     = "raid10",
6700         [BTRFS_RAID_RAID1]      = "raid1",
6701         [BTRFS_RAID_DUP]        = "dup",
6702         [BTRFS_RAID_RAID0]      = "raid0",
6703         [BTRFS_RAID_SINGLE]     = "single",
6704         [BTRFS_RAID_RAID5]      = "raid5",
6705         [BTRFS_RAID_RAID6]      = "raid6",
6706 };
6707
6708 static const char *get_raid_name(enum btrfs_raid_types type)
6709 {
6710         if (type >= BTRFS_NR_RAID_TYPES)
6711                 return NULL;
6712
6713         return btrfs_raid_type_names[type];
6714 }
6715
/*
 * Numbered steps 0..3, in increasing order.
 * NOTE(review): presumably the successive passes of find_free_extent()'s
 * search loop; confirm against that function.
 */
enum btrfs_loop_type {
	LOOP_CACHING_NOWAIT,	/* 0 */
	LOOP_CACHING_WAIT,	/* 1 */
	LOOP_ALLOC_CHUNK,	/* 2 */
	LOOP_NO_EMPTY_SIZE,	/* 3 */
};
6722
6723 static inline void
6724 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6725                        int delalloc)
6726 {
6727         if (delalloc)
6728                 down_read(&cache->data_rwsem);
6729 }
6730
/*
 * Take a reference on the block group and, for delalloc callers, also its
 * data_rwsem for reading.
 */
static inline void
btrfs_grab_block_group(struct btrfs_block_group_cache *cache, int delalloc)
{
	btrfs_get_block_group(cache);
	btrfs_lock_block_group(cache, delalloc);
}
6739
/*
 * Return the block group currently backing 'cluster', or NULL if it has none.
 *
 * Every return path leaves cluster->refill_lock held.
 *
 * When the cluster's block group is 'block_group' itself, it is returned
 * without taking an extra reference or data_rwsem here.  Any other block
 * group is returned with a reference taken and, for delalloc callers, with
 * its data_rwsem held for reading.
 */
6740 static struct btrfs_block_group_cache *
6741 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6742                    struct btrfs_free_cluster *cluster,
6743                    int delalloc)
6744 {
6745         struct btrfs_block_group_cache *used_bg;
6746         bool locked = false;
6747 again:
6748         spin_lock(&cluster->refill_lock);
             /*
              * Second pass: we slept in down_read() without the refill lock.
              * If the cluster still points at the same block group, keep the
              * reference and rwsem we hold; otherwise drop both and re-evaluate.
              */
6749         if (locked) {
6750                 if (used_bg == cluster->block_group)
6751                         return used_bg;
6752
6753                 up_read(&used_bg->data_rwsem);
6754                 btrfs_put_block_group(used_bg);
6755         }
6756
6757         used_bg = cluster->block_group;
6758         if (!used_bg)
6759                 return NULL;
6760
6761         if (used_bg == block_group)
6762                 return used_bg;
6763
6764         btrfs_get_block_group(used_bg);
6765
6766         if (!delalloc)
6767                 return used_bg;
6768
             /* Fast path: take the rwsem without sleeping while the spinlock is held. */
6769         if (down_read_trylock(&used_bg->data_rwsem))
6770                 return used_bg;
6771
             /* Slow path: drop the spinlock, take the rwsem, then recheck the cluster. */
6772         spin_unlock(&cluster->refill_lock);
6773         down_read(&used_bg->data_rwsem);
6774         locked = true;
6775         goto again;
6776 }
6777
6778 static inline void
6779 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6780                          int delalloc)
6781 {
6782         if (delalloc)
6783                 up_read(&cache->data_rwsem);
6784         btrfs_put_block_group(cache);
6785 }
6786
6787 /*
6788  * Walks the block group lists of the space_info for flags and looks for a
6789  * hole of num_bytes.  On success 0 is returned and the key ins records it:
6790  * ins->objectid == start position
6791  * ins->type == BTRFS_EXTENT_ITEM_KEY
6792  * ins->offset == the size of the hole.
6793  * Any available blocks before search_start are skipped.
6794  *
6795  * If there is no suitable free space, -ENOSPC is returned and ins->offset
6796  * records the max size of the free space extent seen during the search.
6797  */
6798 static noinline int find_free_extent(struct btrfs_root *orig_root,
6799                                      u64 num_bytes, u64 empty_size,
6800                                      u64 hint_byte, struct btrfs_key *ins,
6801                                      u64 flags, int delalloc)
6802 {
6803         int ret = 0;
6804         struct btrfs_root *root = orig_root->fs_info->extent_root;
6805         struct btrfs_free_cluster *last_ptr = NULL;
6806         struct btrfs_block_group_cache *block_group = NULL;
6807         u64 search_start = 0;
6808         u64 max_extent_size = 0;
6809         int empty_cluster = 2 * 1024 * 1024;
6810         struct btrfs_space_info *space_info;
6811         int loop = 0;
6812         int index = __get_raid_index(flags);
6813         int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
6814                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
6815         bool failed_cluster_refill = false;
6816         bool failed_alloc = false;
6817         bool use_cluster = true;
6818         bool have_caching_bg = false;
6819
6820         WARN_ON(num_bytes < root->sectorsize);
6821         ins->type = BTRFS_EXTENT_ITEM_KEY;
6822         ins->objectid = 0;
6823         ins->offset = 0;
6824
6825         trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
6826
6827         space_info = __find_space_info(root->fs_info, flags);
6828         if (!space_info) {
6829                 btrfs_err(root->fs_info, "No space info for %llu", flags);
6830                 return -ENOSPC;
6831         }
6832
6833         /*
6834          * If the space info is for both data and metadata it means we have a
6835          * small filesystem and we can't use the clustering stuff.
6836          */
6837         if (btrfs_mixed_space_info(space_info))
6838                 use_cluster = false;
6839
6840         if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
6841                 last_ptr = &root->fs_info->meta_alloc_cluster;
6842                 if (!btrfs_test_opt(root, SSD))
6843                         empty_cluster = 64 * 1024;
6844         }
6845
6846         if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
6847             btrfs_test_opt(root, SSD)) {
6848                 last_ptr = &root->fs_info->data_alloc_cluster;
6849         }
6850
             /*
              * If the cluster already has a block group, start from the
              * cluster's window instead of the caller's hint.
              */
6851         if (last_ptr) {
6852                 spin_lock(&last_ptr->lock);
6853                 if (last_ptr->block_group)
6854                         hint_byte = last_ptr->window_start;
6855                 spin_unlock(&last_ptr->lock);
6856         }
6857
6858         search_start = max(search_start, first_logical_byte(root, 0));
6859         search_start = max(search_start, hint_byte);
6860
             /* no cluster in use, so do not ask for cluster-sized slack */
6861         if (!last_ptr)
6862                 empty_cluster = 0;
6863
6864         if (search_start == hint_byte) {
6865                 block_group = btrfs_lookup_block_group(root->fs_info,
6866                                                        search_start);
6867                 /*
6868                  * we don't want to use the block group if it doesn't match our
6869                  * allocation bits, or if its not cached.
6870                  *
6871                  * However if we are re-searching with an ideal block group
6872                  * picked out then we don't care that the block group is cached.
6873                  */
6874                 if (block_group && block_group_bits(block_group, flags) &&
6875                     block_group->cached != BTRFS_CACHE_NO) {
6876                         down_read(&space_info->groups_sem);
6877                         if (list_empty(&block_group->list) ||
6878                             block_group->ro) {
6879                                 /*
6880                                  * someone is removing this block group,
6881                                  * we can't jump into the have_block_group
6882                                  * target because our list pointers are not
6883                                  * valid
6884                                  */
6885                                 btrfs_put_block_group(block_group);
6886                                 up_read(&space_info->groups_sem);
6887                         } else {
6888                                 index = get_block_group_index(block_group);
6889                                 btrfs_lock_block_group(block_group, delalloc);
6890                                 goto have_block_group;
6891                         }
6892                 } else if (block_group) {
6893                         btrfs_put_block_group(block_group);
6894                 }
6895         }
6896 search:
6897         have_caching_bg = false;
6898         down_read(&space_info->groups_sem);
6899         list_for_each_entry(block_group, &space_info->block_groups[index],
6900                             list) {
6901                 u64 offset;
6902                 int cached;
6903
6904                 btrfs_grab_block_group(block_group, delalloc);
6905                 search_start = block_group->key.objectid;
6906
6907                 /*
6908                  * this can happen if we end up cycling through all the
6909                  * raid types, but we want to make sure we only allocate
6910                  * for the proper type.
6911                  */
6912                 if (!block_group_bits(block_group, flags)) {
6913                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
6914                                 BTRFS_BLOCK_GROUP_RAID1 |
6915                                 BTRFS_BLOCK_GROUP_RAID5 |
6916                                 BTRFS_BLOCK_GROUP_RAID6 |
6917                                 BTRFS_BLOCK_GROUP_RAID10;
6918
6919                         /*
6920                          * if they asked for extra copies and this block group
6921                          * doesn't provide them, bail.  This does allow us to
6922                          * fill raid0 from raid1.
6923                          */
6924                         if ((flags & extra) && !(block_group->flags & extra))
6925                                 goto loop;
6926                 }
6927
6928 have_block_group:
6929                 cached = block_group_cache_done(block_group);
6930                 if (unlikely(!cached)) {
6931                         ret = cache_block_group(block_group, 0);
6932                         BUG_ON(ret < 0);
6933                         ret = 0;
6934                 }
6935
6936                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6937                         goto loop;
6938                 if (unlikely(block_group->ro))
6939                         goto loop;
6940
6941                 /*
6942                  * Ok we want to try and use the cluster allocator, so
6943                  * lets look there
6944                  */
6945                 if (last_ptr) {
6946                         struct btrfs_block_group_cache *used_block_group;
6947                         unsigned long aligned_cluster;
6948                         /*
6949                          * the refill lock keeps out other
6950                          * people trying to start a new cluster
6951                          */
6952                         used_block_group = btrfs_lock_cluster(block_group,
6953                                                               last_ptr,
6954                                                               delalloc);
                             /*
                              * last_ptr->refill_lock is held from here on,
                              * even when NULL was returned
                              */
6955                         if (!used_block_group)
6956                                 goto refill_cluster;
6957
6958                         if (used_block_group != block_group &&
6959                             (used_block_group->ro ||
6960                              !block_group_bits(used_block_group, flags)))
6961                                 goto release_cluster;
6962
6963                         offset = btrfs_alloc_from_cluster(used_block_group,
6964                                                 last_ptr,
6965                                                 num_bytes,
6966                                                 used_block_group->key.objectid,
6967                                                 &max_extent_size);
6968                         if (offset) {
6969                                 /* we have a block, we're done */
6970                                 spin_unlock(&last_ptr->refill_lock);
6971                                 trace_btrfs_reserve_extent_cluster(root,
6972                                                 used_block_group,
6973                                                 search_start, num_bytes);
6974                                 if (used_block_group != block_group) {
6975                                         btrfs_release_block_group(block_group,
6976                                                                   delalloc);
6977                                         block_group = used_block_group;
6978                                 }
6979                                 goto checks;
6980                         }
6981
6982                         WARN_ON(last_ptr->block_group != used_block_group);
6983 release_cluster:
6984                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6985                          * set up a new clusters, so lets just skip it
6986                          * and let the allocator find whatever block
6987                          * it can find.  If we reach this point, we
6988                          * will have tried the cluster allocator
6989                          * plenty of times and not have found
6990                          * anything, so we are likely way too
6991                          * fragmented for the clustering stuff to find
6992                          * anything.
6993                          *
6994                          * However, if the cluster is taken from the
6995                          * current block group, release the cluster
6996                          * first, so that we stand a better chance of
6997                          * succeeding in the unclustered
6998                          * allocation.  */
6999                         if (loop >= LOOP_NO_EMPTY_SIZE &&
7000                             used_block_group != block_group) {
7001                                 spin_unlock(&last_ptr->refill_lock);
7002                                 btrfs_release_block_group(used_block_group,
7003                                                           delalloc);
7004                                 goto unclustered_alloc;
7005                         }
7006
7007                         /*
7008                          * this cluster didn't work out, free it and
7009                          * start over
7010                          */
7011                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7012
7013                         if (used_block_group != block_group)
7014                                 btrfs_release_block_group(used_block_group,
7015                                                           delalloc);
7016 refill_cluster:
7017                         if (loop >= LOOP_NO_EMPTY_SIZE) {
7018                                 spin_unlock(&last_ptr->refill_lock);
7019                                 goto unclustered_alloc;
7020                         }
7021
7022                         aligned_cluster = max_t(unsigned long,
7023                                                 empty_cluster + empty_size,
7024                                               block_group->full_stripe_len);
7025
7026                         /* allocate a cluster in this block group */
7027                         ret = btrfs_find_space_cluster(root, block_group,
7028                                                        last_ptr, search_start,
7029                                                        num_bytes,
7030                                                        aligned_cluster);
7031                         if (ret == 0) {
7032                                 /*
7033                                  * now pull our allocation out of this
7034                                  * cluster
7035                                  */
7036                                 offset = btrfs_alloc_from_cluster(block_group,
7037                                                         last_ptr,
7038                                                         num_bytes,
7039                                                         search_start,
7040                                                         &max_extent_size);
7041                                 if (offset) {
7042                                         /* we found one, proceed */
7043                                         spin_unlock(&last_ptr->refill_lock);
7044                                         trace_btrfs_reserve_extent_cluster(root,
7045                                                 block_group, search_start,
7046                                                 num_bytes);
7047                                         goto checks;
7048                                 }
7049                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
7050                                    && !failed_cluster_refill) {
7051                                 spin_unlock(&last_ptr->refill_lock);
7052
7053                                 failed_cluster_refill = true;
7054                                 wait_block_group_cache_progress(block_group,
7055                                        num_bytes + empty_cluster + empty_size);
7056                                 goto have_block_group;
7057                         }
7058
7059                         /*
7060                          * at this point we either didn't find a cluster
7061                          * or we weren't able to allocate a block from our
7062                          * cluster.  Free the cluster we've been trying
7063                          * to use, and go to the next block group
7064                          */
7065                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7066                         spin_unlock(&last_ptr->refill_lock);
7067                         goto loop;
7068                 }
7069
7070 unclustered_alloc:
7071                 spin_lock(&block_group->free_space_ctl->tree_lock);
7072                 if (cached &&
7073                     block_group->free_space_ctl->free_space <
7074                     num_bytes + empty_cluster + empty_size) {
7075                         if (block_group->free_space_ctl->free_space >
7076                             max_extent_size)
7077                                 max_extent_size =
7078                                         block_group->free_space_ctl->free_space;
7079                         spin_unlock(&block_group->free_space_ctl->tree_lock);
7080                         goto loop;
7081                 }
7082                 spin_unlock(&block_group->free_space_ctl->tree_lock);
7083
7084                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7085                                                     num_bytes, empty_size,
7086                                                     &max_extent_size);
7087                 /*
7088                  * If we didn't find a chunk, and we haven't failed on this
7089                  * block group before, and this block group is in the middle of
7090                  * caching and we are ok with waiting, then go ahead and wait
7091                  * for progress to be made, and set failed_alloc to true.
7092                  *
7093                  * If failed_alloc is true then we've already waited on this
7094                  * block group once and should move on to the next block group.
7095                  */
7096                 if (!offset && !failed_alloc && !cached &&
7097                     loop > LOOP_CACHING_NOWAIT) {
7098                         wait_block_group_cache_progress(block_group,
7099                                                 num_bytes + empty_size);
7100                         failed_alloc = true;
7101                         goto have_block_group;
7102                 } else if (!offset) {
7103                         if (!cached)
7104                                 have_caching_bg = true;
7105                         goto loop;
7106                 }
7107 checks:
                     /*
                      * offset is where the allocator found space; the extent
                      * we hand out starts at the next stripesize boundary
                      */
7108                 search_start = ALIGN(offset, root->stripesize);
7109
7110                 /* move on to the next group */
7111                 if (search_start + num_bytes >
7112                     block_group->key.objectid + block_group->key.offset) {
7113                         btrfs_add_free_space(block_group, offset, num_bytes);
7114                         goto loop;
7115                 }
7116
                     /* give the bytes skipped by the alignment back */
7117                 if (offset < search_start)
7118                         btrfs_add_free_space(block_group, offset,
7119                                              search_start - offset);
7120                 BUG_ON(offset > search_start);
7121
7122                 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
7123                                                   alloc_type, delalloc);
7124                 if (ret == -EAGAIN) {
7125                         btrfs_add_free_space(block_group, offset, num_bytes);
7126                         goto loop;
7127                 }
7128
7129                 /* we are all good, lets return */
7130                 ins->objectid = search_start;
7131                 ins->offset = num_bytes;
7132
7133                 trace_btrfs_reserve_extent(orig_root, block_group,
7134                                            search_start, num_bytes);
7135                 btrfs_release_block_group(block_group, delalloc);
7136                 break;
7137 loop:
                     /*
                      * done with this block group: reset the per-group retry
                      * flags and release it before moving to the next one
                      */
7138                 failed_cluster_refill = false;
7139                 failed_alloc = false;
7140                 BUG_ON(index != get_block_group_index(block_group));
7141                 btrfs_release_block_group(block_group, delalloc);
7142         }
7143         up_read(&space_info->groups_sem);
7144
             /*
              * nothing found, but an unclustered allocation failed on a block
              * group that was still caching: rescan the same raid index
              */
7145         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7146                 goto search;
7147
             /* nothing found on this list, try the next raid index */
7148         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7149                 goto search;
7150
7151         /*
7152          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7153          *                      caching kthreads as we move along
7154          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7155          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7156          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7157          *                      again
7158          */
7159         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7160                 index = 0;
7161                 loop++;
7162                 if (loop == LOOP_ALLOC_CHUNK) {
7163                         struct btrfs_trans_handle *trans;
7164                         int exist = 0;
7165
                             /*
                              * use the transaction already attached to this
                              * task if there is one; otherwise join one here
                              * and end it again below
                              */
7166                         trans = current->journal_info;
7167                         if (trans)
7168                                 exist = 1;
7169                         else
7170                                 trans = btrfs_join_transaction(root);
7171
7172                         if (IS_ERR(trans)) {
7173                                 ret = PTR_ERR(trans);
7174                                 goto out;
7175                         }
7176
7177                         ret = do_chunk_alloc(trans, root, flags,
7178                                              CHUNK_ALLOC_FORCE);
7179                         /*
7180                          * Do not bail out on ENOSPC since we
7181                          * can do more things.
7182                          */
7183                         if (ret < 0 && ret != -ENOSPC)
7184                                 btrfs_abort_transaction(trans,
7185                                                         root, ret);
7186                         else
7187                                 ret = 0;
7188                         if (!exist)
7189                                 btrfs_end_transaction(trans, root);
7190                         if (ret)
7191                                 goto out;
7192                 }
7193
7194                 if (loop == LOOP_NO_EMPTY_SIZE) {
7195                         empty_size = 0;
7196                         empty_cluster = 0;
7197                 }
7198
7199                 goto search;
7200         } else if (!ins->objectid) {
7201                 ret = -ENOSPC;
7202         } else if (ins->objectid) {
7203                 ret = 0;
7204         }
7205 out:
             /*
              * on -ENOSPC report the largest free extent seen;
              * btrfs_reserve_extent() uses it to retry with a smaller size
              */
7206         if (ret == -ENOSPC)
7207                 ins->offset = max_extent_size;
7208         return ret;
7209 }
7210
7211 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7212                             int dump_block_groups)
7213 {
7214         struct btrfs_block_group_cache *cache;
7215         int index = 0;
7216
7217         spin_lock(&info->lock);
7218         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7219                info->flags,
7220                info->total_bytes - info->bytes_used - info->bytes_pinned -
7221                info->bytes_reserved - info->bytes_readonly,
7222                (info->full) ? "" : "not ");
7223         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7224                "reserved=%llu, may_use=%llu, readonly=%llu\n",
7225                info->total_bytes, info->bytes_used, info->bytes_pinned,
7226                info->bytes_reserved, info->bytes_may_use,
7227                info->bytes_readonly);
7228         spin_unlock(&info->lock);
7229
7230         if (!dump_block_groups)
7231                 return;
7232
7233         down_read(&info->groups_sem);
7234 again:
7235         list_for_each_entry(cache, &info->block_groups[index], list) {
7236                 spin_lock(&cache->lock);
7237                 printk(KERN_INFO "BTRFS: "
7238                            "block group %llu has %llu bytes, "
7239                            "%llu used %llu pinned %llu reserved %s\n",
7240                        cache->key.objectid, cache->key.offset,
7241                        btrfs_block_group_used(&cache->item), cache->pinned,
7242                        cache->reserved, cache->ro ? "[readonly]" : "");
7243                 btrfs_dump_free_space(cache, bytes);
7244                 spin_unlock(&cache->lock);
7245         }
7246         if (++index < BTRFS_NR_RAID_TYPES)
7247                 goto again;
7248         up_read(&info->groups_sem);
7249 }
7250
7251 int btrfs_reserve_extent(struct btrfs_root *root,
7252                          u64 num_bytes, u64 min_alloc_size,
7253                          u64 empty_size, u64 hint_byte,
7254                          struct btrfs_key *ins, int is_data, int delalloc)
7255 {
7256         bool final_tried = false;
7257         u64 flags;
7258         int ret;
7259
7260         flags = btrfs_get_alloc_profile(root, is_data);
7261 again:
7262         WARN_ON(num_bytes < root->sectorsize);
7263         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7264                                flags, delalloc);
7265
7266         if (ret == -ENOSPC) {
7267                 if (!final_tried && ins->offset) {
7268                         num_bytes = min(num_bytes >> 1, ins->offset);
7269                         num_bytes = round_down(num_bytes, root->sectorsize);
7270                         num_bytes = max(num_bytes, min_alloc_size);
7271                         if (num_bytes == min_alloc_size)
7272                                 final_tried = true;
7273                         goto again;
7274                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7275                         struct btrfs_space_info *sinfo;
7276
7277                         sinfo = __find_space_info(root->fs_info, flags);
7278                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
7279                                 flags, num_bytes);
7280                         if (sinfo)
7281                                 dump_space_info(sinfo, num_bytes, 1);
7282                 }
7283         }
7284
7285         return ret;
7286 }
7287
7288 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7289                                         u64 start, u64 len,
7290                                         int pin, int delalloc)
7291 {
7292         struct btrfs_block_group_cache *cache;
7293         int ret = 0;
7294
7295         cache = btrfs_lookup_block_group(root->fs_info, start);
7296         if (!cache) {
7297                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
7298                         start);
7299                 return -ENOSPC;
7300         }
7301
7302         if (pin)
7303                 pin_down_extent(root, cache, start, len, 1);
7304         else {
7305                 if (btrfs_test_opt(root, DISCARD))
7306                         ret = btrfs_discard_extent(root, start, len, NULL);
7307                 btrfs_add_free_space(cache, start, len);
7308                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
7309         }
7310
7311         btrfs_put_block_group(cache);
7312
7313         trace_btrfs_reserved_extent_free(root, start, len);
7314
7315         return ret;
7316 }
7317
7318 int btrfs_free_reserved_extent(struct btrfs_root *root,
7319                                u64 start, u64 len, int delalloc)
7320 {
7321         return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
7322 }
7323
7324 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
7325                                        u64 start, u64 len)
7326 {
7327         return __btrfs_free_reserved_extent(root, start, len, 1, 0);
7328 }
7329
7330 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7331                                       struct btrfs_root *root,
7332                                       u64 parent, u64 root_objectid,
7333                                       u64 flags, u64 owner, u64 offset,
7334                                       struct btrfs_key *ins, int ref_mod)
7335 {
7336         int ret;
7337         struct btrfs_fs_info *fs_info = root->fs_info;
7338         struct btrfs_extent_item *extent_item;
7339         struct btrfs_extent_inline_ref *iref;
7340         struct btrfs_path *path;
7341         struct extent_buffer *leaf;
7342         int type;
7343         u32 size;
7344
7345         if (parent > 0)
7346                 type = BTRFS_SHARED_DATA_REF_KEY;
7347         else
7348                 type = BTRFS_EXTENT_DATA_REF_KEY;
7349
7350         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7351
7352         path = btrfs_alloc_path();
7353         if (!path)
7354                 return -ENOMEM;
7355
7356         path->leave_spinning = 1;
7357         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7358                                       ins, size);
7359         if (ret) {
7360                 btrfs_free_path(path);
7361                 return ret;
7362         }
7363
7364         leaf = path->nodes[0];
7365         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7366                                      struct btrfs_extent_item);
7367         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7368         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7369         btrfs_set_extent_flags(leaf, extent_item,
7370                                flags | BTRFS_EXTENT_FLAG_DATA);
7371
7372         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7373         btrfs_set_extent_inline_ref_type(leaf, iref, type);
7374         if (parent > 0) {
7375                 struct btrfs_shared_data_ref *ref;
7376                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7377                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7378                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7379         } else {
7380                 struct btrfs_extent_data_ref *ref;
7381                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7382                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7383                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7384                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7385                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7386         }
7387
7388         btrfs_mark_buffer_dirty(path->nodes[0]);
7389         btrfs_free_path(path);
7390
7391         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7392         if (ret) { /* -ENOENT, logic error */
7393                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7394                         ins->objectid, ins->offset);
7395                 BUG();
7396         }
7397         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
7398         return ret;
7399 }
7400
7401 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7402                                      struct btrfs_root *root,
7403                                      u64 parent, u64 root_objectid,
7404                                      u64 flags, struct btrfs_disk_key *key,
7405                                      int level, struct btrfs_key *ins,
7406                                      int no_quota)
7407 {
7408         int ret;
7409         struct btrfs_fs_info *fs_info = root->fs_info;
7410         struct btrfs_extent_item *extent_item;
7411         struct btrfs_tree_block_info *block_info;
7412         struct btrfs_extent_inline_ref *iref;
7413         struct btrfs_path *path;
7414         struct extent_buffer *leaf;
7415         u32 size = sizeof(*extent_item) + sizeof(*iref);
7416         u64 num_bytes = ins->offset;
7417         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7418                                                  SKINNY_METADATA);
7419
7420         if (!skinny_metadata)
7421                 size += sizeof(*block_info);
7422
7423         path = btrfs_alloc_path();
7424         if (!path) {
7425                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7426                                                    root->nodesize);
7427                 return -ENOMEM;
7428         }
7429
7430         path->leave_spinning = 1;
7431         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7432                                       ins, size);
7433         if (ret) {
7434                 btrfs_free_path(path);
7435                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7436                                                    root->nodesize);
7437                 return ret;
7438         }
7439
7440         leaf = path->nodes[0];
7441         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7442                                      struct btrfs_extent_item);
7443         btrfs_set_extent_refs(leaf, extent_item, 1);
7444         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7445         btrfs_set_extent_flags(leaf, extent_item,
7446                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7447
7448         if (skinny_metadata) {
7449                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7450                 num_bytes = root->nodesize;
7451         } else {
7452                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7453                 btrfs_set_tree_block_key(leaf, block_info, key);
7454                 btrfs_set_tree_block_level(leaf, block_info, level);
7455                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7456         }
7457
7458         if (parent > 0) {
7459                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7460                 btrfs_set_extent_inline_ref_type(leaf, iref,
7461                                                  BTRFS_SHARED_BLOCK_REF_KEY);
7462                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7463         } else {
7464                 btrfs_set_extent_inline_ref_type(leaf, iref,
7465                                                  BTRFS_TREE_BLOCK_REF_KEY);
7466                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
7467         }
7468
7469         btrfs_mark_buffer_dirty(leaf);
7470         btrfs_free_path(path);
7471
7472         ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7473                                  1);
7474         if (ret) { /* -ENOENT, logic error */
7475                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7476                         ins->objectid, ins->offset);
7477                 BUG();
7478         }
7479
7480         trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7481         return ret;
7482 }
7483
7484 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7485                                      struct btrfs_root *root,
7486                                      u64 root_objectid, u64 owner,
7487                                      u64 offset, struct btrfs_key *ins)
7488 {
7489         int ret;
7490
7491         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
7492
7493         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7494                                          ins->offset, 0,
7495                                          root_objectid, owner, offset,
7496                                          BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
7497         return ret;
7498 }
7499
7500 /*
7501  * this is used by the tree logging recovery code.  It records that
7502  * an extent has been allocated and makes sure to clear the free
7503  * space cache bits as well
7504  */
7505 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7506                                    struct btrfs_root *root,
7507                                    u64 root_objectid, u64 owner, u64 offset,
7508                                    struct btrfs_key *ins)
7509 {
7510         int ret;
7511         struct btrfs_block_group_cache *block_group;
7512
7513         /*
7514          * Mixed block groups will exclude before processing the log so we only
7515          * need to do the exlude dance if this fs isn't mixed.
7516          */
7517         if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
7518                 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
7519                 if (ret)
7520                         return ret;
7521         }
7522
7523         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
7524         if (!block_group)
7525                 return -EINVAL;
7526
7527         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7528                                           RESERVE_ALLOC_NO_ACCOUNT, 0);
7529         BUG_ON(ret); /* logic error */
7530         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7531                                          0, owner, offset, ins, 1);
7532         btrfs_put_block_group(block_group);
7533         return ret;
7534 }
7535
7536 static struct extent_buffer *
7537 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7538                       u64 bytenr, int level)
7539 {
7540         struct extent_buffer *buf;
7541
7542         buf = btrfs_find_create_tree_block(root, bytenr);
7543         if (!buf)
7544                 return ERR_PTR(-ENOMEM);
7545         btrfs_set_header_generation(buf, trans->transid);
7546         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7547         btrfs_tree_lock(buf);
7548         clean_tree_block(trans, root->fs_info, buf);
7549         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7550
7551         btrfs_set_lock_blocking(buf);
7552         btrfs_set_buffer_uptodate(buf);
7553
7554         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7555                 buf->log_index = root->log_transid % 2;
7556                 /*
7557                  * we allow two log transactions at a time, use different
7558                  * EXENT bit to differentiate dirty pages.
7559                  */
7560                 if (buf->log_index == 0)
7561                         set_extent_dirty(&root->dirty_log_pages, buf->start,
7562                                         buf->start + buf->len - 1, GFP_NOFS);
7563                 else
7564                         set_extent_new(&root->dirty_log_pages, buf->start,
7565                                         buf->start + buf->len - 1, GFP_NOFS);
7566         } else {
7567                 buf->log_index = -1;
7568                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7569                          buf->start + buf->len - 1, GFP_NOFS);
7570         }
7571         trans->blocks_used++;
7572         /* this returns a buffer locked for blocking */
7573         return buf;
7574 }
7575
7576 static struct btrfs_block_rsv *
7577 use_block_rsv(struct btrfs_trans_handle *trans,
7578               struct btrfs_root *root, u32 blocksize)
7579 {
7580         struct btrfs_block_rsv *block_rsv;
7581         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
7582         int ret;
7583         bool global_updated = false;
7584
7585         block_rsv = get_block_rsv(trans, root);
7586
7587         if (unlikely(block_rsv->size == 0))
7588                 goto try_reserve;
7589 again:
7590         ret = block_rsv_use_bytes(block_rsv, blocksize);
7591         if (!ret)
7592                 return block_rsv;
7593
7594         if (block_rsv->failfast)
7595                 return ERR_PTR(ret);
7596
7597         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7598                 global_updated = true;
7599                 update_global_block_rsv(root->fs_info);
7600                 goto again;
7601         }
7602
7603         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7604                 static DEFINE_RATELIMIT_STATE(_rs,
7605                                 DEFAULT_RATELIMIT_INTERVAL * 10,
7606                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
7607                 if (__ratelimit(&_rs))
7608                         WARN(1, KERN_DEBUG
7609                                 "BTRFS: block rsv returned %d\n", ret);
7610         }
7611 try_reserve:
7612         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
7613                                      BTRFS_RESERVE_NO_FLUSH);
7614         if (!ret)
7615                 return block_rsv;
7616         /*
7617          * If we couldn't reserve metadata bytes try and use some from
7618          * the global reserve if its space type is the same as the global
7619          * reservation.
7620          */
7621         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7622             block_rsv->space_info == global_rsv->space_info) {
7623                 ret = block_rsv_use_bytes(global_rsv, blocksize);
7624                 if (!ret)
7625                         return global_rsv;
7626         }
7627         return ERR_PTR(ret);
7628 }
7629
7630 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7631                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
7632 {
7633         block_rsv_add_bytes(block_rsv, blocksize, 0);
7634         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
7635 }
7636
7637 /*
7638  * finds a free extent and does all the dirty work required for allocation
7639  * returns the key for the extent through ins, and a tree buffer for
7640  * the first block of the extent through buf.
7641  *
7642  * returns the tree buffer or an ERR_PTR on error.
7643  */
7644 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7645                                         struct btrfs_root *root,
7646                                         u64 parent, u64 root_objectid,
7647                                         struct btrfs_disk_key *key, int level,
7648                                         u64 hint, u64 empty_size)
7649 {
7650         struct btrfs_key ins;
7651         struct btrfs_block_rsv *block_rsv;
7652         struct extent_buffer *buf;
7653         struct btrfs_delayed_extent_op *extent_op;
7654         u64 flags = 0;
7655         int ret;
7656         u32 blocksize = root->nodesize;
7657         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7658                                                  SKINNY_METADATA);
7659
7660         if (btrfs_test_is_dummy_root(root)) {
7661                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7662                                             level);
7663                 if (!IS_ERR(buf))
7664                         root->alloc_bytenr += blocksize;
7665                 return buf;
7666         }
7667
7668         block_rsv = use_block_rsv(trans, root, blocksize);
7669         if (IS_ERR(block_rsv))
7670                 return ERR_CAST(block_rsv);
7671
7672         ret = btrfs_reserve_extent(root, blocksize, blocksize,
7673                                    empty_size, hint, &ins, 0, 0);
7674         if (ret)
7675                 goto out_unuse;
7676
7677         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7678         if (IS_ERR(buf)) {
7679                 ret = PTR_ERR(buf);
7680                 goto out_free_reserved;
7681         }
7682
7683         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7684                 if (parent == 0)
7685                         parent = ins.objectid;
7686                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7687         } else
7688                 BUG_ON(parent > 0);
7689
7690         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7691                 extent_op = btrfs_alloc_delayed_extent_op();
7692                 if (!extent_op) {
7693                         ret = -ENOMEM;
7694                         goto out_free_buf;
7695                 }
7696                 if (key)
7697                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
7698                 else
7699                         memset(&extent_op->key, 0, sizeof(extent_op->key));
7700                 extent_op->flags_to_set = flags;
7701                 if (skinny_metadata)
7702                         extent_op->update_key = 0;
7703                 else
7704                         extent_op->update_key = 1;
7705                 extent_op->update_flags = 1;
7706                 extent_op->is_data = 0;
7707                 extent_op->level = level;
7708
7709                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7710                                                  ins.objectid, ins.offset,
7711                                                  parent, root_objectid, level,
7712                                                  BTRFS_ADD_DELAYED_EXTENT,
7713                                                  extent_op, 0);
7714                 if (ret)
7715                         goto out_free_delayed;
7716         }
7717         return buf;
7718
7719 out_free_delayed:
7720         btrfs_free_delayed_extent_op(extent_op);
7721 out_free_buf:
7722         free_extent_buffer(buf);
7723 out_free_reserved:
7724         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
7725 out_unuse:
7726         unuse_block_rsv(root->fs_info, block_rsv, blocksize);
7727         return ERR_PTR(ret);
7728 }
7729
7730 struct walk_control { /* state shared by the walk_down helpers and reada_walk_down() */
7731         u64 refs[BTRFS_MAX_LEVEL]; /* per-level ref count, filled by btrfs_lookup_extent_info() */
7732         u64 flags[BTRFS_MAX_LEVEL]; /* per-level extent flags, e.g. BTRFS_BLOCK_FLAG_FULL_BACKREF */
7733         struct btrfs_key update_progress; /* node keys are compared against this; smaller keys are skipped */
7734         int stage; /* DROP_REFERENCE or UPDATE_BACKREF */
7735         int level; /* tree level currently being processed */
7736         int shared_level; /* set to the child level when do_walk_down() switches to UPDATE_BACKREF */
7737         int update_ref; /* when 0, shared blocks never trigger the switch to UPDATE_BACKREF */
7738         int keep_locks; /* when set, walk_down_proc() keeps the block locked in DROP_REFERENCE */
7739         int reada_slot; /* slot at which the last reada_walk_down() pass stopped */
7740         int reada_count; /* readahead budget per pass; grown or shrunk by reada_walk_down() */
7741         int for_reloc; /* NOTE(review): not referenced in this part of the file - presumably set for relocation; confirm */
7742 };
7743
7744 #define DROP_REFERENCE  1 /* stage: check ref counts of pointed-to blocks, drop refs to shared ones */
7745 #define UPDATE_BACKREF  2 /* stage: update back refs for the pointers in a shared block */
7746
7747 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7748                                      struct btrfs_root *root,
7749                                      struct walk_control *wc,
7750                                      struct btrfs_path *path)
7751 {
7752         u64 bytenr;
7753         u64 generation;
7754         u64 refs;
7755         u64 flags;
7756         u32 nritems;
7757         u32 blocksize;
7758         struct btrfs_key key;
7759         struct extent_buffer *eb;
7760         int ret;
7761         int slot;
7762         int nread = 0;
7763
7764         if (path->slots[wc->level] < wc->reada_slot) {
7765                 wc->reada_count = wc->reada_count * 2 / 3;
7766                 wc->reada_count = max(wc->reada_count, 2);
7767         } else {
7768                 wc->reada_count = wc->reada_count * 3 / 2;
7769                 wc->reada_count = min_t(int, wc->reada_count,
7770                                         BTRFS_NODEPTRS_PER_BLOCK(root));
7771         }
7772
7773         eb = path->nodes[wc->level];
7774         nritems = btrfs_header_nritems(eb);
7775         blocksize = root->nodesize;
7776
7777         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7778                 if (nread >= wc->reada_count)
7779                         break;
7780
7781                 cond_resched();
7782                 bytenr = btrfs_node_blockptr(eb, slot);
7783                 generation = btrfs_node_ptr_generation(eb, slot);
7784
7785                 if (slot == path->slots[wc->level])
7786                         goto reada;
7787
7788                 if (wc->stage == UPDATE_BACKREF &&
7789                     generation <= root->root_key.offset)
7790                         continue;
7791
7792                 /* We don't lock the tree block, it's OK to be racy here */
7793                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
7794                                                wc->level - 1, 1, &refs,
7795                                                &flags);
7796                 /* We don't care about errors in readahead. */
7797                 if (ret < 0)
7798                         continue;
7799                 BUG_ON(refs == 0);
7800
7801                 if (wc->stage == DROP_REFERENCE) {
7802                         if (refs == 1)
7803                                 goto reada;
7804
7805                         if (wc->level == 1 &&
7806                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7807                                 continue;
7808                         if (!wc->update_ref ||
7809                             generation <= root->root_key.offset)
7810                                 continue;
7811                         btrfs_node_key_to_cpu(eb, &key, slot);
7812                         ret = btrfs_comp_cpu_keys(&key,
7813                                                   &wc->update_progress);
7814                         if (ret < 0)
7815                                 continue;
7816                 } else {
7817                         if (wc->level == 1 &&
7818                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7819                                 continue;
7820                 }
7821 reada:
7822                 readahead_tree_block(root, bytenr);
7823                 nread++;
7824         }
7825         wc->reada_slot = slot;
7826 }
7827
7828 /*
7829  * TODO: Modify related function to add related node/leaf to dirty_extent_root,
7830  * for later qgroup accounting.
7831  *
7832  * Current, this function does nothing.
7833  */
7834 static int account_leaf_items(struct btrfs_trans_handle *trans,
7835                               struct btrfs_root *root,
7836                               struct extent_buffer *eb)
7837 {
7838         int nr = btrfs_header_nritems(eb);
7839         int i, extent_type;
7840         struct btrfs_key key;
7841         struct btrfs_file_extent_item *fi;
7842         u64 bytenr, num_bytes;
7843
7844         for (i = 0; i < nr; i++) {
7845                 btrfs_item_key_to_cpu(eb, &key, i);
7846
7847                 if (key.type != BTRFS_EXTENT_DATA_KEY)
7848                         continue;
7849
7850                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
7851                 /* filter out non qgroup-accountable extents  */
7852                 extent_type = btrfs_file_extent_type(eb, fi);
7853
7854                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
7855                         continue;
7856
7857                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
7858                 if (!bytenr)
7859                         continue;
7860
7861                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
7862         }
7863         return 0;
7864 }
7865
7866 /*
7867  * Walk up the tree from the bottom, freeing leaves and any interior
7868  * nodes which have had all slots visited. If a node (leaf or
7869  * interior) is freed, the node above it will have it's slot
7870  * incremented. The root node will never be freed.
7871  *
7872  * At the end of this function, we should have a path which has all
7873  * slots incremented to the next position for a search. If we need to
7874  * read a new node it will be NULL and the node above it will have the
7875  * correct slot selected for a later read.
7876  *
7877  * If we increment the root nodes slot counter past the number of
7878  * elements, 1 is returned to signal completion of the search.
7879  */
7880 static int adjust_slots_upwards(struct btrfs_root *root,
7881                                 struct btrfs_path *path, int root_level)
7882 {
7883         int level = 0;
7884         int nr, slot;
7885         struct extent_buffer *eb;
7886
7887         if (root_level == 0)
7888                 return 1;
7889
7890         while (level <= root_level) {
7891                 eb = path->nodes[level];
7892                 nr = btrfs_header_nritems(eb);
7893                 path->slots[level]++;
7894                 slot = path->slots[level];
7895                 if (slot >= nr || level == 0) {
7896                         /*
7897                          * Don't free the root -  we will detect this
7898                          * condition after our loop and return a
7899                          * positive value for caller to stop walking the tree.
7900                          */
7901                         if (level != root_level) {
7902                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
7903                                 path->locks[level] = 0;
7904
7905                                 free_extent_buffer(eb);
7906                                 path->nodes[level] = NULL;
7907                                 path->slots[level] = 0;
7908                         }
7909                 } else {
7910                         /*
7911                          * We have a valid slot to walk back down
7912                          * from. Stop here so caller can process these
7913                          * new nodes.
7914                          */
7915                         break;
7916                 }
7917
7918                 level++;
7919         }
7920
7921         eb = path->nodes[root_level];
7922         if (path->slots[root_level] >= btrfs_header_nritems(eb))
7923                 return 1;
7924
7925         return 0;
7926 }
7927
7928 /*
7929  * root_eb is the subtree root and is locked before this function is called.
7930  * TODO: Modify this function to mark all (including complete shared node)
7931  * to dirty_extent_root to allow it get accounted in qgroup.
7932  */
7933 static int account_shared_subtree(struct btrfs_trans_handle *trans,
7934                                   struct btrfs_root *root,
7935                                   struct extent_buffer *root_eb,
7936                                   u64 root_gen,
7937                                   int root_level)
7938 {
7939         int ret = 0;
7940         int level;
7941         struct extent_buffer *eb = root_eb;
7942         struct btrfs_path *path = NULL;
7943
7944         BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
7945         BUG_ON(root_eb == NULL);
7946
7947         if (!root->fs_info->quota_enabled)
7948                 return 0;
7949
7950         if (!extent_buffer_uptodate(root_eb)) {
7951                 ret = btrfs_read_buffer(root_eb, root_gen);
7952                 if (ret)
7953                         goto out;
7954         }
7955
7956         if (root_level == 0) {
7957                 ret = account_leaf_items(trans, root, root_eb);
7958                 goto out;
7959         }
7960
7961         path = btrfs_alloc_path();
7962         if (!path)
7963                 return -ENOMEM;
7964
7965         /*
7966          * Walk down the tree.  Missing extent blocks are filled in as
7967          * we go. Metadata is accounted every time we read a new
7968          * extent block.
7969          *
7970          * When we reach a leaf, we account for file extent items in it,
7971          * walk back up the tree (adjusting slot pointers as we go)
7972          * and restart the search process.
7973          */
7974         extent_buffer_get(root_eb); /* For path */
7975         path->nodes[root_level] = root_eb;
7976         path->slots[root_level] = 0;
7977         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
7978 walk_down:
7979         level = root_level;
7980         while (level >= 0) {
7981                 if (path->nodes[level] == NULL) {
7982                         int parent_slot;
7983                         u64 child_gen;
7984                         u64 child_bytenr;
7985
7986                         /* We need to get child blockptr/gen from
7987                          * parent before we can read it. */
7988                         eb = path->nodes[level + 1];
7989                         parent_slot = path->slots[level + 1];
7990                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7991                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7992
7993                         eb = read_tree_block(root, child_bytenr, child_gen);
7994                         if (IS_ERR(eb)) {
7995                                 ret = PTR_ERR(eb);
7996                                 goto out;
7997                         } else if (!extent_buffer_uptodate(eb)) {
7998                                 free_extent_buffer(eb);
7999                                 ret = -EIO;
8000                                 goto out;
8001                         }
8002
8003                         path->nodes[level] = eb;
8004                         path->slots[level] = 0;
8005
8006                         btrfs_tree_read_lock(eb);
8007                         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8008                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8009                 }
8010
8011                 if (level == 0) {
8012                         ret = account_leaf_items(trans, root, path->nodes[level]);
8013                         if (ret)
8014                                 goto out;
8015
8016                         /* Nonzero return here means we completed our search */
8017                         ret = adjust_slots_upwards(root, path, root_level);
8018                         if (ret)
8019                                 break;
8020
8021                         /* Restart search with new slots */
8022                         goto walk_down;
8023                 }
8024
8025                 level--;
8026         }
8027
8028         ret = 0;
8029 out:
8030         btrfs_free_path(path);
8031
8032         return ret;
8033 }
8034
8035 /*
8036  * helper to process tree block while walking down the tree.
8037  *
8038  * when wc->stage == UPDATE_BACKREF, this function updates
8039  * back refs for pointers in the block.
8040  *
8041  * NOTE: return value 1 means we should stop walking down.
8042  */
8043 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8044                                    struct btrfs_root *root,
8045                                    struct btrfs_path *path,
8046                                    struct walk_control *wc, int lookup_info)
8047 {
8048         int level = wc->level;
8049         struct extent_buffer *eb = path->nodes[level];
8050         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8051         int ret;
8052
8053         if (wc->stage == UPDATE_BACKREF &&
8054             btrfs_header_owner(eb) != root->root_key.objectid)
8055                 return 1;
8056
8057         /*
8058          * when reference count of tree block is 1, it won't increase
8059          * again. once full backref flag is set, we never clear it.
8060          */
8061         if (lookup_info &&
8062             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8063              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8064                 BUG_ON(!path->locks[level]);
8065                 ret = btrfs_lookup_extent_info(trans, root,
8066                                                eb->start, level, 1,
8067                                                &wc->refs[level],
8068                                                &wc->flags[level]);
8069                 BUG_ON(ret == -ENOMEM);
8070                 if (ret)
8071                         return ret;
8072                 BUG_ON(wc->refs[level] == 0);
8073         }
8074
8075         if (wc->stage == DROP_REFERENCE) {
8076                 if (wc->refs[level] > 1)
8077                         return 1;
8078
8079                 if (path->locks[level] && !wc->keep_locks) {
8080                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8081                         path->locks[level] = 0;
8082                 }
8083                 return 0;
8084         }
8085
8086         /* wc->stage == UPDATE_BACKREF */
8087         if (!(wc->flags[level] & flag)) {
8088                 BUG_ON(!path->locks[level]);
8089                 ret = btrfs_inc_ref(trans, root, eb, 1);
8090                 BUG_ON(ret); /* -ENOMEM */
8091                 ret = btrfs_dec_ref(trans, root, eb, 0);
8092                 BUG_ON(ret); /* -ENOMEM */
8093                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8094                                                   eb->len, flag,
8095                                                   btrfs_header_level(eb), 0);
8096                 BUG_ON(ret); /* -ENOMEM */
8097                 wc->flags[level] |= flag;
8098         }
8099
8100         /*
8101          * the block is shared by multiple trees, so it's not good to
8102          * keep the tree lock
8103          */
8104         if (path->locks[level] && level > 0) {
8105                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8106                 path->locks[level] = 0;
8107         }
8108         return 0;
8109 }
8110
8111 /*
8112  * helper to process tree block pointer.
8113  *
8114  * when wc->stage == DROP_REFERENCE, this function checks
8115  * reference count of the block pointed to. if the block
8116  * is shared and we need update back refs for the subtree
8117  * rooted at the block, this function changes wc->stage to
8118  * UPDATE_BACKREF. if the block is shared and there is no
8119  * need to update back, this function drops the reference
8120  * to the block.
8121  *
8122  * NOTE: return value 1 means we should stop walking down.
8123  */
8124 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8125                                  struct btrfs_root *root,
8126                                  struct btrfs_path *path,
8127                                  struct walk_control *wc, int *lookup_info)
8128 {
8129         u64 bytenr;
8130         u64 generation;
8131         u64 parent;
8132         u32 blocksize;
8133         struct btrfs_key key;
8134         struct extent_buffer *next;
8135         int level = wc->level;
8136         int reada = 0;
8137         int ret = 0;
8138         bool need_account = false;
8139
8140         generation = btrfs_node_ptr_generation(path->nodes[level],
8141                                                path->slots[level]);
8142         /*
8143          * if the lower level block was created before the snapshot
8144          * was created, we know there is no need to update back refs
8145          * for the subtree
8146          */
8147         if (wc->stage == UPDATE_BACKREF &&
8148             generation <= root->root_key.offset) {
8149                 *lookup_info = 1;
8150                 return 1;
8151         }
8152
8153         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8154         blocksize = root->nodesize;
8155
8156         next = btrfs_find_tree_block(root->fs_info, bytenr);
8157         if (!next) {
8158                 next = btrfs_find_create_tree_block(root, bytenr);
8159                 if (!next)
8160                         return -ENOMEM;
8161                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8162                                                level - 1);
8163                 reada = 1;
8164         }
8165         btrfs_tree_lock(next);
8166         btrfs_set_lock_blocking(next);
8167
8168         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8169                                        &wc->refs[level - 1],
8170                                        &wc->flags[level - 1]);
8171         if (ret < 0) {
8172                 btrfs_tree_unlock(next);
8173                 return ret;
8174         }
8175
8176         if (unlikely(wc->refs[level - 1] == 0)) {
8177                 btrfs_err(root->fs_info, "Missing references.");
8178                 BUG();
8179         }
8180         *lookup_info = 0;
8181
8182         if (wc->stage == DROP_REFERENCE) {
8183                 if (wc->refs[level - 1] > 1) {
8184                         need_account = true;
8185                         if (level == 1 &&
8186                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8187                                 goto skip;
8188
8189                         if (!wc->update_ref ||
8190                             generation <= root->root_key.offset)
8191                                 goto skip;
8192
8193                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8194                                               path->slots[level]);
8195                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8196                         if (ret < 0)
8197                                 goto skip;
8198
8199                         wc->stage = UPDATE_BACKREF;
8200                         wc->shared_level = level - 1;
8201                 }
8202         } else {
8203                 if (level == 1 &&
8204                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8205                         goto skip;
8206         }
8207
8208         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8209                 btrfs_tree_unlock(next);
8210                 free_extent_buffer(next);
8211                 next = NULL;
8212                 *lookup_info = 1;
8213         }
8214
8215         if (!next) {
8216                 if (reada && level == 1)
8217                         reada_walk_down(trans, root, wc, path);
8218                 next = read_tree_block(root, bytenr, generation);
8219                 if (IS_ERR(next)) {
8220                         return PTR_ERR(next);
8221                 } else if (!extent_buffer_uptodate(next)) {
8222                         free_extent_buffer(next);
8223                         return -EIO;
8224                 }
8225                 btrfs_tree_lock(next);
8226                 btrfs_set_lock_blocking(next);
8227         }
8228
8229         level--;
8230         BUG_ON(level != btrfs_header_level(next));
8231         path->nodes[level] = next;
8232         path->slots[level] = 0;
8233         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8234         wc->level = level;
8235         if (wc->level == 1)
8236                 wc->reada_slot = 0;
8237         return 0;
8238 skip:
8239         wc->refs[level - 1] = 0;
8240         wc->flags[level - 1] = 0;
8241         if (wc->stage == DROP_REFERENCE) {
8242                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8243                         parent = path->nodes[level]->start;
8244                 } else {
8245                         BUG_ON(root->root_key.objectid !=
8246                                btrfs_header_owner(path->nodes[level]));
8247                         parent = 0;
8248                 }
8249
8250                 if (need_account) {
8251                         ret = account_shared_subtree(trans, root, next,
8252                                                      generation, level - 1);
8253                         if (ret) {
8254                                 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8255                                         "%d accounting shared subtree. Quota "
8256                                         "is out of sync, rescan required.\n",
8257                                         root->fs_info->sb->s_id, ret);
8258                         }
8259                 }
8260                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8261                                 root->root_key.objectid, level - 1, 0, 0);
8262                 BUG_ON(ret); /* -ENOMEM */
8263         }
8264         btrfs_tree_unlock(next);
8265         free_extent_buffer(next);
8266         *lookup_info = 1;
8267         return 1;
8268 }
8269
8270 /*
8271  * helper to process tree block while walking up the tree.
8272  *
8273  * when wc->stage == DROP_REFERENCE, this function drops
8274  * reference count on the block.
8275  *
8276  * when wc->stage == UPDATE_BACKREF, this function changes
8277  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8278  * to UPDATE_BACKREF previously while processing the block.
8279  *
8280  * NOTE: return value 1 means we should stop walking up.
8281  */
8282 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8283                                  struct btrfs_root *root,
8284                                  struct btrfs_path *path,
8285                                  struct walk_control *wc)
8286 {
8287         int ret;
8288         int level = wc->level;
8289         struct extent_buffer *eb = path->nodes[level];
8290         u64 parent = 0;
8291
8292         if (wc->stage == UPDATE_BACKREF) {
8293                 BUG_ON(wc->shared_level < level);
8294                 if (level < wc->shared_level)
8295                         goto out;
8296
8297                 ret = find_next_key(path, level + 1, &wc->update_progress);
8298                 if (ret > 0)
8299                         wc->update_ref = 0;
8300
8301                 wc->stage = DROP_REFERENCE;
8302                 wc->shared_level = -1;
8303                 path->slots[level] = 0;
8304
8305                 /*
8306                  * check reference count again if the block isn't locked.
8307                  * we should start walking down the tree again if reference
8308                  * count is one.
8309                  */
8310                 if (!path->locks[level]) {
8311                         BUG_ON(level == 0);
8312                         btrfs_tree_lock(eb);
8313                         btrfs_set_lock_blocking(eb);
8314                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8315
8316                         ret = btrfs_lookup_extent_info(trans, root,
8317                                                        eb->start, level, 1,
8318                                                        &wc->refs[level],
8319                                                        &wc->flags[level]);
8320                         if (ret < 0) {
8321                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8322                                 path->locks[level] = 0;
8323                                 return ret;
8324                         }
8325                         BUG_ON(wc->refs[level] == 0);
8326                         if (wc->refs[level] == 1) {
8327                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8328                                 path->locks[level] = 0;
8329                                 return 1;
8330                         }
8331                 }
8332         }
8333
8334         /* wc->stage == DROP_REFERENCE */
8335         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8336
8337         if (wc->refs[level] == 1) {
8338                 if (level == 0) {
8339                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8340                                 ret = btrfs_dec_ref(trans, root, eb, 1);
8341                         else
8342                                 ret = btrfs_dec_ref(trans, root, eb, 0);
8343                         BUG_ON(ret); /* -ENOMEM */
8344                         ret = account_leaf_items(trans, root, eb);
8345                         if (ret) {
8346                                 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8347                                         "%d accounting leaf items. Quota "
8348                                         "is out of sync, rescan required.\n",
8349                                         root->fs_info->sb->s_id, ret);
8350                         }
8351                 }
8352                 /* make block locked assertion in clean_tree_block happy */
8353                 if (!path->locks[level] &&
8354                     btrfs_header_generation(eb) == trans->transid) {
8355                         btrfs_tree_lock(eb);
8356                         btrfs_set_lock_blocking(eb);
8357                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8358                 }
8359                 clean_tree_block(trans, root->fs_info, eb);
8360         }
8361
8362         if (eb == root->node) {
8363                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8364                         parent = eb->start;
8365                 else
8366                         BUG_ON(root->root_key.objectid !=
8367                                btrfs_header_owner(eb));
8368         } else {
8369                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8370                         parent = path->nodes[level + 1]->start;
8371                 else
8372                         BUG_ON(root->root_key.objectid !=
8373                                btrfs_header_owner(path->nodes[level + 1]));
8374         }
8375
8376         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8377 out:
8378         wc->refs[level] = 0;
8379         wc->flags[level] = 0;
8380         return 0;
8381 }
8382
8383 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8384                                    struct btrfs_root *root,
8385                                    struct btrfs_path *path,
8386                                    struct walk_control *wc)
8387 {
8388         int level = wc->level;
8389         int lookup_info = 1;
8390         int ret;
8391
8392         while (level >= 0) {
8393                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8394                 if (ret > 0)
8395                         break;
8396
8397                 if (level == 0)
8398                         break;
8399
8400                 if (path->slots[level] >=
8401                     btrfs_header_nritems(path->nodes[level]))
8402                         break;
8403
8404                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8405                 if (ret > 0) {
8406                         path->slots[level]++;
8407                         continue;
8408                 } else if (ret < 0)
8409                         return ret;
8410                 level = wc->level;
8411         }
8412         return 0;
8413 }
8414
8415 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8416                                  struct btrfs_root *root,
8417                                  struct btrfs_path *path,
8418                                  struct walk_control *wc, int max_level)
8419 {
8420         int level = wc->level;
8421         int ret;
8422
8423         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8424         while (level < max_level && path->nodes[level]) {
8425                 wc->level = level;
8426                 if (path->slots[level] + 1 <
8427                     btrfs_header_nritems(path->nodes[level])) {
8428                         path->slots[level]++;
8429                         return 0;
8430                 } else {
8431                         ret = walk_up_proc(trans, root, path, wc);
8432                         if (ret > 0)
8433                                 return 0;
8434
8435                         if (path->locks[level]) {
8436                                 btrfs_tree_unlock_rw(path->nodes[level],
8437                                                      path->locks[level]);
8438                                 path->locks[level] = 0;
8439                         }
8440                         free_extent_buffer(path->nodes[level]);
8441                         path->nodes[level] = NULL;
8442                         level++;
8443                 }
8444         }
8445         return 1;
8446 }
8447
8448 /*
8449  * drop a subvolume tree.
8450  *
8451  * this function traverses the tree freeing any blocks that only
8452  * referenced by the tree.
8453  *
8454  * when a shared tree block is found. this function decreases its
8455  * reference count by one. if update_ref is true, this function
8456  * also make sure backrefs for the shared block and all lower level
8457  * blocks are properly updated.
8458  *
8459  * If called with for_reloc == 0, may exit early with -EAGAIN
8460  */
8461 int btrfs_drop_snapshot(struct btrfs_root *root,
8462                          struct btrfs_block_rsv *block_rsv, int update_ref,
8463                          int for_reloc)
8464 {
8465         struct btrfs_path *path;
8466         struct btrfs_trans_handle *trans;
8467         struct btrfs_root *tree_root = root->fs_info->tree_root;
8468         struct btrfs_root_item *root_item = &root->root_item;
8469         struct walk_control *wc;
8470         struct btrfs_key key;
8471         int err = 0;
8472         int ret;
8473         int level;
8474         bool root_dropped = false;
8475
8476         btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8477
8478         path = btrfs_alloc_path();
8479         if (!path) {
8480                 err = -ENOMEM;
8481                 goto out;
8482         }
8483
8484         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8485         if (!wc) {
8486                 btrfs_free_path(path);
8487                 err = -ENOMEM;
8488                 goto out;
8489         }
8490
8491         trans = btrfs_start_transaction(tree_root, 0);
8492         if (IS_ERR(trans)) {
8493                 err = PTR_ERR(trans);
8494                 goto out_free;
8495         }
8496
8497         if (block_rsv)
8498                 trans->block_rsv = block_rsv;
8499
8500         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8501                 level = btrfs_header_level(root->node);
8502                 path->nodes[level] = btrfs_lock_root_node(root);
8503                 btrfs_set_lock_blocking(path->nodes[level]);
8504                 path->slots[level] = 0;
8505                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8506                 memset(&wc->update_progress, 0,
8507                        sizeof(wc->update_progress));
8508         } else {
8509                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8510                 memcpy(&wc->update_progress, &key,
8511                        sizeof(wc->update_progress));
8512
8513                 level = root_item->drop_level;
8514                 BUG_ON(level == 0);
8515                 path->lowest_level = level;
8516                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8517                 path->lowest_level = 0;
8518                 if (ret < 0) {
8519                         err = ret;
8520                         goto out_end_trans;
8521                 }
8522                 WARN_ON(ret > 0);
8523
8524                 /*
8525                  * unlock our path, this is safe because only this
8526                  * function is allowed to delete this snapshot
8527                  */
8528                 btrfs_unlock_up_safe(path, 0);
8529
8530                 level = btrfs_header_level(root->node);
8531                 while (1) {
8532                         btrfs_tree_lock(path->nodes[level]);
8533                         btrfs_set_lock_blocking(path->nodes[level]);
8534                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8535
8536                         ret = btrfs_lookup_extent_info(trans, root,
8537                                                 path->nodes[level]->start,
8538                                                 level, 1, &wc->refs[level],
8539                                                 &wc->flags[level]);
8540                         if (ret < 0) {
8541                                 err = ret;
8542                                 goto out_end_trans;
8543                         }
8544                         BUG_ON(wc->refs[level] == 0);
8545
8546                         if (level == root_item->drop_level)
8547                                 break;
8548
8549                         btrfs_tree_unlock(path->nodes[level]);
8550                         path->locks[level] = 0;
8551                         WARN_ON(wc->refs[level] != 1);
8552                         level--;
8553                 }
8554         }
8555
8556         wc->level = level;
8557         wc->shared_level = -1;
8558         wc->stage = DROP_REFERENCE;
8559         wc->update_ref = update_ref;
8560         wc->keep_locks = 0;
8561         wc->for_reloc = for_reloc;
8562         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8563
8564         while (1) {
8565
8566                 ret = walk_down_tree(trans, root, path, wc);
8567                 if (ret < 0) {
8568                         err = ret;
8569                         break;
8570                 }
8571
8572                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8573                 if (ret < 0) {
8574                         err = ret;
8575                         break;
8576                 }
8577
8578                 if (ret > 0) {
8579                         BUG_ON(wc->stage != DROP_REFERENCE);
8580                         break;
8581                 }
8582
8583                 if (wc->stage == DROP_REFERENCE) {
8584                         level = wc->level;
8585                         btrfs_node_key(path->nodes[level],
8586                                        &root_item->drop_progress,
8587                                        path->slots[level]);
8588                         root_item->drop_level = level;
8589                 }
8590
8591                 BUG_ON(wc->level == 0);
8592                 if (btrfs_should_end_transaction(trans, tree_root) ||
8593                     (!for_reloc && btrfs_need_cleaner_sleep(root))) {
8594                         ret = btrfs_update_root(trans, tree_root,
8595                                                 &root->root_key,
8596                                                 root_item);
8597                         if (ret) {
8598                                 btrfs_abort_transaction(trans, tree_root, ret);
8599                                 err = ret;
8600                                 goto out_end_trans;
8601                         }
8602
8603                         btrfs_end_transaction_throttle(trans, tree_root);
8604                         if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8605                                 pr_debug("BTRFS: drop snapshot early exit\n");
8606                                 err = -EAGAIN;
8607                                 goto out_free;
8608                         }
8609
8610                         trans = btrfs_start_transaction(tree_root, 0);
8611                         if (IS_ERR(trans)) {
8612                                 err = PTR_ERR(trans);
8613                                 goto out_free;
8614                         }
8615                         if (block_rsv)
8616                                 trans->block_rsv = block_rsv;
8617                 }
8618         }
8619         btrfs_release_path(path);
8620         if (err)
8621                 goto out_end_trans;
8622
8623         ret = btrfs_del_root(trans, tree_root, &root->root_key);
8624         if (ret) {
8625                 btrfs_abort_transaction(trans, tree_root, ret);
8626                 goto out_end_trans;
8627         }
8628
8629         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8630                 ret = btrfs_find_root(tree_root, &root->root_key, path,
8631                                       NULL, NULL);
8632                 if (ret < 0) {
8633                         btrfs_abort_transaction(trans, tree_root, ret);
8634                         err = ret;
8635                         goto out_end_trans;
8636                 } else if (ret > 0) {
8637                         /* if we fail to delete the orphan item this time
8638                          * around, it'll get picked up the next time.
8639                          *
8640                          * The most common failure here is just -ENOENT.
8641                          */
8642                         btrfs_del_orphan_item(trans, tree_root,
8643                                               root->root_key.objectid);
8644                 }
8645         }
8646
8647         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
8648                 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
8649         } else {
8650                 free_extent_buffer(root->node);
8651                 free_extent_buffer(root->commit_root);
8652                 btrfs_put_fs_root(root);
8653         }
8654         root_dropped = true;
8655 out_end_trans:
8656         btrfs_end_transaction_throttle(trans, tree_root);
8657 out_free:
8658         kfree(wc);
8659         btrfs_free_path(path);
8660 out:
8661         /*
8662          * So if we need to stop dropping the snapshot for whatever reason we
8663          * need to make sure to add it back to the dead root list so that we
8664          * keep trying to do the work later.  This also cleans up roots if we
8665          * don't have it in the radix (like when we recover after a power fail
8666          * or unmount) so we don't leak memory.
8667          */
8668         if (!for_reloc && root_dropped == false)
8669                 btrfs_add_dead_root(root);
8670         if (err && err != -EAGAIN)
8671                 btrfs_std_error(root->fs_info, err);
8672         return err;
8673 }
8674
8675 /*
8676  * drop subtree rooted at tree block 'node'.
8677  *
8678  * NOTE: this function will unlock and release tree block 'node'
8679  * only used by relocation code
8680  */
8681 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
8682                         struct btrfs_root *root,
8683                         struct extent_buffer *node,
8684                         struct extent_buffer *parent)
8685 {
8686         struct btrfs_path *path;
8687         struct walk_control *wc;
8688         int level;
8689         int parent_level;
8690         int ret = 0;
8691         int wret;
8692
8693         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
8694
8695         path = btrfs_alloc_path();
8696         if (!path)
8697                 return -ENOMEM;
8698
8699         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8700         if (!wc) {
8701                 btrfs_free_path(path);
8702                 return -ENOMEM;
8703         }
8704
8705         btrfs_assert_tree_locked(parent);
8706         parent_level = btrfs_header_level(parent);
8707         extent_buffer_get(parent);
8708         path->nodes[parent_level] = parent;
8709         path->slots[parent_level] = btrfs_header_nritems(parent);
8710
8711         btrfs_assert_tree_locked(node);
8712         level = btrfs_header_level(node);
8713         path->nodes[level] = node;
8714         path->slots[level] = 0;
8715         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8716
8717         wc->refs[parent_level] = 1;
8718         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8719         wc->level = level;
8720         wc->shared_level = -1;
8721         wc->stage = DROP_REFERENCE;
8722         wc->update_ref = 0;
8723         wc->keep_locks = 1;
8724         wc->for_reloc = 1;
8725         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8726
8727         while (1) {
8728                 wret = walk_down_tree(trans, root, path, wc);
8729                 if (wret < 0) {
8730                         ret = wret;
8731                         break;
8732                 }
8733
8734                 wret = walk_up_tree(trans, root, path, wc, parent_level);
8735                 if (wret < 0)
8736                         ret = wret;
8737                 if (wret != 0)
8738                         break;
8739         }
8740
8741         kfree(wc);
8742         btrfs_free_path(path);
8743         return ret;
8744 }
8745
8746 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
8747 {
8748         u64 num_devices;
8749         u64 stripped;
8750
8751         /*
8752          * if restripe for this chunk_type is on pick target profile and
8753          * return, otherwise do the usual balance
8754          */
8755         stripped = get_restripe_target(root->fs_info, flags);
8756         if (stripped)
8757                 return extended_to_chunk(stripped);
8758
8759         num_devices = root->fs_info->fs_devices->rw_devices;
8760
8761         stripped = BTRFS_BLOCK_GROUP_RAID0 |
8762                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
8763                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
8764
8765         if (num_devices == 1) {
8766                 stripped |= BTRFS_BLOCK_GROUP_DUP;
8767                 stripped = flags & ~stripped;
8768
8769                 /* turn raid0 into single device chunks */
8770                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
8771                         return stripped;
8772
8773                 /* turn mirroring into duplication */
8774                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
8775                              BTRFS_BLOCK_GROUP_RAID10))
8776                         return stripped | BTRFS_BLOCK_GROUP_DUP;
8777         } else {
8778                 /* they already had raid on here, just return */
8779                 if (flags & stripped)
8780                         return flags;
8781
8782                 stripped |= BTRFS_BLOCK_GROUP_DUP;
8783                 stripped = flags & ~stripped;
8784
8785                 /* switch duplicated blocks with raid1 */
8786                 if (flags & BTRFS_BLOCK_GROUP_DUP)
8787                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
8788
8789                 /* this is drive concat, leave it alone */
8790         }
8791
8792         return flags;
8793 }
8794
8795 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8796 {
8797         struct btrfs_space_info *sinfo = cache->space_info;
8798         u64 num_bytes;
8799         u64 min_allocable_bytes;
8800         int ret = -ENOSPC;
8801
8802
8803         /*
8804          * We need some metadata space and system metadata space for
8805          * allocating chunks in some corner cases until we force to set
8806          * it to be readonly.
8807          */
8808         if ((sinfo->flags &
8809              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
8810             !force)
8811                 min_allocable_bytes = 1 * 1024 * 1024;
8812         else
8813                 min_allocable_bytes = 0;
8814
8815         spin_lock(&sinfo->lock);
8816         spin_lock(&cache->lock);
8817
8818         if (cache->ro) {
8819                 ret = 0;
8820                 goto out;
8821         }
8822
8823         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8824                     cache->bytes_super - btrfs_block_group_used(&cache->item);
8825
8826         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
8827             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
8828             min_allocable_bytes <= sinfo->total_bytes) {
8829                 sinfo->bytes_readonly += num_bytes;
8830                 cache->ro = 1;
8831                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
8832                 ret = 0;
8833         }
8834 out:
8835         spin_unlock(&cache->lock);
8836         spin_unlock(&sinfo->lock);
8837         return ret;
8838 }
8839
8840 int btrfs_set_block_group_ro(struct btrfs_root *root,
8841                              struct btrfs_block_group_cache *cache)
8842
8843 {
8844         struct btrfs_trans_handle *trans;
8845         u64 alloc_flags;
8846         int ret;
8847
8848         BUG_ON(cache->ro);
8849
8850 again:
8851         trans = btrfs_join_transaction(root);
8852         if (IS_ERR(trans))
8853                 return PTR_ERR(trans);
8854
8855         /*
8856          * we're not allowed to set block groups readonly after the dirty
8857          * block groups cache has started writing.  If it already started,
8858          * back off and let this transaction commit
8859          */
8860         mutex_lock(&root->fs_info->ro_block_group_mutex);
8861         if (trans->transaction->dirty_bg_run) {
8862                 u64 transid = trans->transid;
8863
8864                 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8865                 btrfs_end_transaction(trans, root);
8866
8867                 ret = btrfs_wait_for_commit(root, transid);
8868                 if (ret)
8869                         return ret;
8870                 goto again;
8871         }
8872
8873         /*
8874          * if we are changing raid levels, try to allocate a corresponding
8875          * block group with the new raid level.
8876          */
8877         alloc_flags = update_block_group_flags(root, cache->flags);
8878         if (alloc_flags != cache->flags) {
8879                 ret = do_chunk_alloc(trans, root, alloc_flags,
8880                                      CHUNK_ALLOC_FORCE);
8881                 /*
8882                  * ENOSPC is allowed here, we may have enough space
8883                  * already allocated at the new raid level to
8884                  * carry on
8885                  */
8886                 if (ret == -ENOSPC)
8887                         ret = 0;
8888                 if (ret < 0)
8889                         goto out;
8890         }
8891
8892         ret = set_block_group_ro(cache, 0);
8893         if (!ret)
8894                 goto out;
8895         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8896         ret = do_chunk_alloc(trans, root, alloc_flags,
8897                              CHUNK_ALLOC_FORCE);
8898         if (ret < 0)
8899                 goto out;
8900         ret = set_block_group_ro(cache, 0);
8901 out:
8902         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
8903                 alloc_flags = update_block_group_flags(root, cache->flags);
8904                 lock_chunks(root->fs_info->chunk_root);
8905                 check_system_chunk(trans, root, alloc_flags);
8906                 unlock_chunks(root->fs_info->chunk_root);
8907         }
8908         mutex_unlock(&root->fs_info->ro_block_group_mutex);
8909
8910         btrfs_end_transaction(trans, root);
8911         return ret;
8912 }
8913
8914 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8915                             struct btrfs_root *root, u64 type)
8916 {
8917         u64 alloc_flags = get_alloc_profile(root, type);
8918         return do_chunk_alloc(trans, root, alloc_flags,
8919                               CHUNK_ALLOC_FORCE);
8920 }
8921
8922 /*
8923  * helper to account the unused space of all the readonly block group in the
8924  * space_info. takes mirrors into account.
8925  */
8926 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8927 {
8928         struct btrfs_block_group_cache *block_group;
8929         u64 free_bytes = 0;
8930         int factor;
8931
8932         /* It's df, we don't care if it's racey */
8933         if (list_empty(&sinfo->ro_bgs))
8934                 return 0;
8935
8936         spin_lock(&sinfo->lock);
8937         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
8938                 spin_lock(&block_group->lock);
8939
8940                 if (!block_group->ro) {
8941                         spin_unlock(&block_group->lock);
8942                         continue;
8943                 }
8944
8945                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8946                                           BTRFS_BLOCK_GROUP_RAID10 |
8947                                           BTRFS_BLOCK_GROUP_DUP))
8948                         factor = 2;
8949                 else
8950                         factor = 1;
8951
8952                 free_bytes += (block_group->key.offset -
8953                                btrfs_block_group_used(&block_group->item)) *
8954                                factor;
8955
8956                 spin_unlock(&block_group->lock);
8957         }
8958         spin_unlock(&sinfo->lock);
8959
8960         return free_bytes;
8961 }
8962
8963 void btrfs_set_block_group_rw(struct btrfs_root *root,
8964                               struct btrfs_block_group_cache *cache)
8965 {
8966         struct btrfs_space_info *sinfo = cache->space_info;
8967         u64 num_bytes;
8968
8969         BUG_ON(!cache->ro);
8970
8971         spin_lock(&sinfo->lock);
8972         spin_lock(&cache->lock);
8973         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8974                     cache->bytes_super - btrfs_block_group_used(&cache->item);
8975         sinfo->bytes_readonly -= num_bytes;
8976         cache->ro = 0;
8977         list_del_init(&cache->ro_list);
8978         spin_unlock(&cache->lock);
8979         spin_unlock(&sinfo->lock);
8980 }
8981
8982 /*
8983  * checks to see if its even possible to relocate this block group.
8984  *
8985  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
8986  * ok to go ahead and try.
8987  */
8988 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8989 {
8990         struct btrfs_block_group_cache *block_group;
8991         struct btrfs_space_info *space_info;
8992         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8993         struct btrfs_device *device;
8994         struct btrfs_trans_handle *trans;
8995         u64 min_free;
8996         u64 dev_min = 1;
8997         u64 dev_nr = 0;
8998         u64 target;
8999         int index;
9000         int full = 0;
9001         int ret = 0;
9002
9003         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9004
9005         /* odd, couldn't find the block group, leave it alone */
9006         if (!block_group)
9007                 return -1;
9008
9009         min_free = btrfs_block_group_used(&block_group->item);
9010
9011         /* no bytes used, we're good */
9012         if (!min_free)
9013                 goto out;
9014
9015         space_info = block_group->space_info;
9016         spin_lock(&space_info->lock);
9017
9018         full = space_info->full;
9019
9020         /*
9021          * if this is the last block group we have in this space, we can't
9022          * relocate it unless we're able to allocate a new chunk below.
9023          *
9024          * Otherwise, we need to make sure we have room in the space to handle
9025          * all of the extents from this block group.  If we can, we're good
9026          */
9027         if ((space_info->total_bytes != block_group->key.offset) &&
9028             (space_info->bytes_used + space_info->bytes_reserved +
9029              space_info->bytes_pinned + space_info->bytes_readonly +
9030              min_free < space_info->total_bytes)) {
9031                 spin_unlock(&space_info->lock);
9032                 goto out;
9033         }
9034         spin_unlock(&space_info->lock);
9035
9036         /*
9037          * ok we don't have enough space, but maybe we have free space on our
9038          * devices to allocate new chunks for relocation, so loop through our
9039          * alloc devices and guess if we have enough space.  if this block
9040          * group is going to be restriped, run checks against the target
9041          * profile instead of the current one.
9042          */
9043         ret = -1;
9044
9045         /*
9046          * index:
9047          *      0: raid10
9048          *      1: raid1
9049          *      2: dup
9050          *      3: raid0
9051          *      4: single
9052          */
9053         target = get_restripe_target(root->fs_info, block_group->flags);
9054         if (target) {
9055                 index = __get_raid_index(extended_to_chunk(target));
9056         } else {
9057                 /*
9058                  * this is just a balance, so if we were marked as full
9059                  * we know there is no space for a new chunk
9060                  */
9061                 if (full)
9062                         goto out;
9063
9064                 index = get_block_group_index(block_group);
9065         }
9066
9067         if (index == BTRFS_RAID_RAID10) {
9068                 dev_min = 4;
9069                 /* Divide by 2 */
9070                 min_free >>= 1;
9071         } else if (index == BTRFS_RAID_RAID1) {
9072                 dev_min = 2;
9073         } else if (index == BTRFS_RAID_DUP) {
9074                 /* Multiply by 2 */
9075                 min_free <<= 1;
9076         } else if (index == BTRFS_RAID_RAID0) {
9077                 dev_min = fs_devices->rw_devices;
9078                 min_free = div64_u64(min_free, dev_min);
9079         }
9080
9081         /* We need to do this so that we can look at pending chunks */
9082         trans = btrfs_join_transaction(root);
9083         if (IS_ERR(trans)) {
9084                 ret = PTR_ERR(trans);
9085                 goto out;
9086         }
9087
9088         mutex_lock(&root->fs_info->chunk_mutex);
9089         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9090                 u64 dev_offset;
9091
9092                 /*
9093                  * check to make sure we can actually find a chunk with enough
9094                  * space to fit our block group in.
9095                  */
9096                 if (device->total_bytes > device->bytes_used + min_free &&
9097                     !device->is_tgtdev_for_dev_replace) {
9098                         ret = find_free_dev_extent(trans, device, min_free,
9099                                                    &dev_offset, NULL);
9100                         if (!ret)
9101                                 dev_nr++;
9102
9103                         if (dev_nr >= dev_min)
9104                                 break;
9105
9106                         ret = -1;
9107                 }
9108         }
9109         mutex_unlock(&root->fs_info->chunk_mutex);
9110         btrfs_end_transaction(trans, root);
9111 out:
9112         btrfs_put_block_group(block_group);
9113         return ret;
9114 }
9115
9116 static int find_first_block_group(struct btrfs_root *root,
9117                 struct btrfs_path *path, struct btrfs_key *key)
9118 {
9119         int ret = 0;
9120         struct btrfs_key found_key;
9121         struct extent_buffer *leaf;
9122         int slot;
9123
9124         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9125         if (ret < 0)
9126                 goto out;
9127
9128         while (1) {
9129                 slot = path->slots[0];
9130                 leaf = path->nodes[0];
9131                 if (slot >= btrfs_header_nritems(leaf)) {
9132                         ret = btrfs_next_leaf(root, path);
9133                         if (ret == 0)
9134                                 continue;
9135                         if (ret < 0)
9136                                 goto out;
9137                         break;
9138                 }
9139                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9140
9141                 if (found_key.objectid >= key->objectid &&
9142                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9143                         ret = 0;
9144                         goto out;
9145                 }
9146                 path->slots[0]++;
9147         }
9148 out:
9149         return ret;
9150 }
9151
9152 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9153 {
9154         struct btrfs_block_group_cache *block_group;
9155         u64 last = 0;
9156
9157         while (1) {
9158                 struct inode *inode;
9159
9160                 block_group = btrfs_lookup_first_block_group(info, last);
9161                 while (block_group) {
9162                         spin_lock(&block_group->lock);
9163                         if (block_group->iref)
9164                                 break;
9165                         spin_unlock(&block_group->lock);
9166                         block_group = next_block_group(info->tree_root,
9167                                                        block_group);
9168                 }
9169                 if (!block_group) {
9170                         if (last == 0)
9171                                 break;
9172                         last = 0;
9173                         continue;
9174                 }
9175
9176                 inode = block_group->inode;
9177                 block_group->iref = 0;
9178                 block_group->inode = NULL;
9179                 spin_unlock(&block_group->lock);
9180                 iput(inode);
9181                 last = block_group->key.objectid + block_group->key.offset;
9182                 btrfs_put_block_group(block_group);
9183         }
9184 }
9185
9186 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9187 {
9188         struct btrfs_block_group_cache *block_group;
9189         struct btrfs_space_info *space_info;
9190         struct btrfs_caching_control *caching_ctl;
9191         struct rb_node *n;
9192
9193         down_write(&info->commit_root_sem);
9194         while (!list_empty(&info->caching_block_groups)) {
9195                 caching_ctl = list_entry(info->caching_block_groups.next,
9196                                          struct btrfs_caching_control, list);
9197                 list_del(&caching_ctl->list);
9198                 put_caching_control(caching_ctl);
9199         }
9200         up_write(&info->commit_root_sem);
9201
9202         spin_lock(&info->unused_bgs_lock);
9203         while (!list_empty(&info->unused_bgs)) {
9204                 block_group = list_first_entry(&info->unused_bgs,
9205                                                struct btrfs_block_group_cache,
9206                                                bg_list);
9207                 list_del_init(&block_group->bg_list);
9208                 btrfs_put_block_group(block_group);
9209         }
9210         spin_unlock(&info->unused_bgs_lock);
9211
9212         spin_lock(&info->block_group_cache_lock);
9213         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9214                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9215                                        cache_node);
9216                 rb_erase(&block_group->cache_node,
9217                          &info->block_group_cache_tree);
9218                 RB_CLEAR_NODE(&block_group->cache_node);
9219                 spin_unlock(&info->block_group_cache_lock);
9220
9221                 down_write(&block_group->space_info->groups_sem);
9222                 list_del(&block_group->list);
9223                 up_write(&block_group->space_info->groups_sem);
9224
9225                 if (block_group->cached == BTRFS_CACHE_STARTED)
9226                         wait_block_group_cache_done(block_group);
9227
9228                 /*
9229                  * We haven't cached this block group, which means we could
9230                  * possibly have excluded extents on this block group.
9231                  */
9232                 if (block_group->cached == BTRFS_CACHE_NO ||
9233                     block_group->cached == BTRFS_CACHE_ERROR)
9234                         free_excluded_extents(info->extent_root, block_group);
9235
9236                 btrfs_remove_free_space_cache(block_group);
9237                 btrfs_put_block_group(block_group);
9238
9239                 spin_lock(&info->block_group_cache_lock);
9240         }
9241         spin_unlock(&info->block_group_cache_lock);
9242
9243         /* now that all the block groups are freed, go through and
9244          * free all the space_info structs.  This is only called during
9245          * the final stages of unmount, and so we know nobody is
9246          * using them.  We call synchronize_rcu() once before we start,
9247          * just to be on the safe side.
9248          */
9249         synchronize_rcu();
9250
9251         release_global_block_rsv(info);
9252
9253         while (!list_empty(&info->space_info)) {
9254                 int i;
9255
9256                 space_info = list_entry(info->space_info.next,
9257                                         struct btrfs_space_info,
9258                                         list);
9259                 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
9260                         if (WARN_ON(space_info->bytes_pinned > 0 ||
9261                             space_info->bytes_reserved > 0 ||
9262                             space_info->bytes_may_use > 0)) {
9263                                 dump_space_info(space_info, 0, 0);
9264                         }
9265                 }
9266                 list_del(&space_info->list);
9267                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9268                         struct kobject *kobj;
9269                         kobj = space_info->block_group_kobjs[i];
9270                         space_info->block_group_kobjs[i] = NULL;
9271                         if (kobj) {
9272                                 kobject_del(kobj);
9273                                 kobject_put(kobj);
9274                         }
9275                 }
9276                 kobject_del(&space_info->kobj);
9277                 kobject_put(&space_info->kobj);
9278         }
9279         return 0;
9280 }
9281
9282 static void __link_block_group(struct btrfs_space_info *space_info,
9283                                struct btrfs_block_group_cache *cache)
9284 {
9285         int index = get_block_group_index(cache);
9286         bool first = false;
9287
9288         down_write(&space_info->groups_sem);
9289         if (list_empty(&space_info->block_groups[index]))
9290                 first = true;
9291         list_add_tail(&cache->list, &space_info->block_groups[index]);
9292         up_write(&space_info->groups_sem);
9293
9294         if (first) {
9295                 struct raid_kobject *rkobj;
9296                 int ret;
9297
9298                 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9299                 if (!rkobj)
9300                         goto out_err;
9301                 rkobj->raid_type = index;
9302                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9303                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9304                                   "%s", get_raid_name(index));
9305                 if (ret) {
9306                         kobject_put(&rkobj->kobj);
9307                         goto out_err;
9308                 }
9309                 space_info->block_group_kobjs[index] = &rkobj->kobj;
9310         }
9311
9312         return;
9313 out_err:
9314         pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
9315 }
9316
9317 static struct btrfs_block_group_cache *
9318 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9319 {
9320         struct btrfs_block_group_cache *cache;
9321
9322         cache = kzalloc(sizeof(*cache), GFP_NOFS);
9323         if (!cache)
9324                 return NULL;
9325
9326         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9327                                         GFP_NOFS);
9328         if (!cache->free_space_ctl) {
9329                 kfree(cache);
9330                 return NULL;
9331         }
9332
9333         cache->key.objectid = start;
9334         cache->key.offset = size;
9335         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9336
9337         cache->sectorsize = root->sectorsize;
9338         cache->fs_info = root->fs_info;
9339         cache->full_stripe_len = btrfs_full_stripe_len(root,
9340                                                &root->fs_info->mapping_tree,
9341                                                start);
9342         atomic_set(&cache->count, 1);
9343         spin_lock_init(&cache->lock);
9344         init_rwsem(&cache->data_rwsem);
9345         INIT_LIST_HEAD(&cache->list);
9346         INIT_LIST_HEAD(&cache->cluster_list);
9347         INIT_LIST_HEAD(&cache->bg_list);
9348         INIT_LIST_HEAD(&cache->ro_list);
9349         INIT_LIST_HEAD(&cache->dirty_list);
9350         INIT_LIST_HEAD(&cache->io_list);
9351         btrfs_init_free_space_ctl(cache);
9352         atomic_set(&cache->trimming, 0);
9353
9354         return cache;
9355 }
9356
9357 int btrfs_read_block_groups(struct btrfs_root *root)
9358 {
9359         struct btrfs_path *path;
9360         int ret;
9361         struct btrfs_block_group_cache *cache;
9362         struct btrfs_fs_info *info = root->fs_info;
9363         struct btrfs_space_info *space_info;
9364         struct btrfs_key key;
9365         struct btrfs_key found_key;
9366         struct extent_buffer *leaf;
9367         int need_clear = 0;
9368         u64 cache_gen;
9369
9370         root = info->extent_root;
9371         key.objectid = 0;
9372         key.offset = 0;
9373         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9374         path = btrfs_alloc_path();
9375         if (!path)
9376                 return -ENOMEM;
9377         path->reada = 1;
9378
9379         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
9380         if (btrfs_test_opt(root, SPACE_CACHE) &&
9381             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
9382                 need_clear = 1;
9383         if (btrfs_test_opt(root, CLEAR_CACHE))
9384                 need_clear = 1;
9385
9386         while (1) {
9387                 ret = find_first_block_group(root, path, &key);
9388                 if (ret > 0)
9389                         break;
9390                 if (ret != 0)
9391                         goto error;
9392
9393                 leaf = path->nodes[0];
9394                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9395
9396                 cache = btrfs_create_block_group_cache(root, found_key.objectid,
9397                                                        found_key.offset);
9398                 if (!cache) {
9399                         ret = -ENOMEM;
9400                         goto error;
9401                 }
9402
9403                 if (need_clear) {
9404                         /*
9405                          * When we mount with old space cache, we need to
9406                          * set BTRFS_DC_CLEAR and set dirty flag.
9407                          *
9408                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9409                          *    truncate the old free space cache inode and
9410                          *    setup a new one.
9411                          * b) Setting 'dirty flag' makes sure that we flush
9412                          *    the new space cache info onto disk.
9413                          */
9414                         if (btrfs_test_opt(root, SPACE_CACHE))
9415                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
9416                 }
9417
9418                 read_extent_buffer(leaf, &cache->item,
9419                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
9420                                    sizeof(cache->item));
9421                 cache->flags = btrfs_block_group_flags(&cache->item);
9422
9423                 key.objectid = found_key.objectid + found_key.offset;
9424                 btrfs_release_path(path);
9425
9426                 /*
9427                  * We need to exclude the super stripes now so that the space
9428                  * info has super bytes accounted for, otherwise we'll think
9429                  * we have more space than we actually do.
9430                  */
9431                 ret = exclude_super_stripes(root, cache);
9432                 if (ret) {
9433                         /*
9434                          * We may have excluded something, so call this just in
9435                          * case.
9436                          */
9437                         free_excluded_extents(root, cache);
9438                         btrfs_put_block_group(cache);
9439                         goto error;
9440                 }
9441
9442                 /*
9443                  * check for two cases, either we are full, and therefore
9444                  * don't need to bother with the caching work since we won't
9445                  * find any space, or we are empty, and we can just add all
9446                  * the space in and be done with it.  This saves us _alot_ of
9447                  * time, particularly in the full case.
9448                  */
9449                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
9450                         cache->last_byte_to_unpin = (u64)-1;
9451                         cache->cached = BTRFS_CACHE_FINISHED;
9452                         free_excluded_extents(root, cache);
9453                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9454                         cache->last_byte_to_unpin = (u64)-1;
9455                         cache->cached = BTRFS_CACHE_FINISHED;
9456                         add_new_free_space(cache, root->fs_info,
9457                                            found_key.objectid,
9458                                            found_key.objectid +
9459                                            found_key.offset);
9460                         free_excluded_extents(root, cache);
9461                 }
9462
9463                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
9464                 if (ret) {
9465                         btrfs_remove_free_space_cache(cache);
9466                         btrfs_put_block_group(cache);
9467                         goto error;
9468                 }
9469
9470                 ret = update_space_info(info, cache->flags, found_key.offset,
9471                                         btrfs_block_group_used(&cache->item),
9472                                         &space_info);
9473                 if (ret) {
9474                         btrfs_remove_free_space_cache(cache);
9475                         spin_lock(&info->block_group_cache_lock);
9476                         rb_erase(&cache->cache_node,
9477                                  &info->block_group_cache_tree);
9478                         RB_CLEAR_NODE(&cache->cache_node);
9479                         spin_unlock(&info->block_group_cache_lock);
9480                         btrfs_put_block_group(cache);
9481                         goto error;
9482                 }
9483
9484                 cache->space_info = space_info;
9485                 spin_lock(&cache->space_info->lock);
9486                 cache->space_info->bytes_readonly += cache->bytes_super;
9487                 spin_unlock(&cache->space_info->lock);
9488
9489                 __link_block_group(space_info, cache);
9490
9491                 set_avail_alloc_bits(root->fs_info, cache->flags);
9492                 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
9493                         set_block_group_ro(cache, 1);
9494                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9495                         spin_lock(&info->unused_bgs_lock);
9496                         /* Should always be true but just in case. */
9497                         if (list_empty(&cache->bg_list)) {
9498                                 btrfs_get_block_group(cache);
9499                                 list_add_tail(&cache->bg_list,
9500                                               &info->unused_bgs);
9501                         }
9502                         spin_unlock(&info->unused_bgs_lock);
9503                 }
9504         }
9505
9506         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
9507                 if (!(get_alloc_profile(root, space_info->flags) &
9508                       (BTRFS_BLOCK_GROUP_RAID10 |
9509                        BTRFS_BLOCK_GROUP_RAID1 |
9510                        BTRFS_BLOCK_GROUP_RAID5 |
9511                        BTRFS_BLOCK_GROUP_RAID6 |
9512                        BTRFS_BLOCK_GROUP_DUP)))
9513                         continue;
9514                 /*
9515                  * avoid allocating from un-mirrored block group if there are
9516                  * mirrored block groups.
9517                  */
9518                 list_for_each_entry(cache,
9519                                 &space_info->block_groups[BTRFS_RAID_RAID0],
9520                                 list)
9521                         set_block_group_ro(cache, 1);
9522                 list_for_each_entry(cache,
9523                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
9524                                 list)
9525                         set_block_group_ro(cache, 1);
9526         }
9527
9528         init_global_block_rsv(info);
9529         ret = 0;
9530 error:
9531         btrfs_free_path(path);
9532         return ret;
9533 }
9534
9535 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9536                                        struct btrfs_root *root)
9537 {
9538         struct btrfs_block_group_cache *block_group, *tmp;
9539         struct btrfs_root *extent_root = root->fs_info->extent_root;
9540         struct btrfs_block_group_item item;
9541         struct btrfs_key key;
9542         int ret = 0;
9543
9544         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9545                 if (ret)
9546                         goto next;
9547
9548                 spin_lock(&block_group->lock);
9549                 memcpy(&item, &block_group->item, sizeof(item));
9550                 memcpy(&key, &block_group->key, sizeof(key));
9551                 spin_unlock(&block_group->lock);
9552
9553                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
9554                                         sizeof(item));
9555                 if (ret)
9556                         btrfs_abort_transaction(trans, extent_root, ret);
9557                 ret = btrfs_finish_chunk_alloc(trans, extent_root,
9558                                                key.objectid, key.offset);
9559                 if (ret)
9560                         btrfs_abort_transaction(trans, extent_root, ret);
9561 next:
9562                 list_del_init(&block_group->bg_list);
9563         }
9564 }
9565
9566 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9567                            struct btrfs_root *root, u64 bytes_used,
9568                            u64 type, u64 chunk_objectid, u64 chunk_offset,
9569                            u64 size)
9570 {
9571         int ret;
9572         struct btrfs_root *extent_root;
9573         struct btrfs_block_group_cache *cache;
9574
9575         extent_root = root->fs_info->extent_root;
9576
9577         btrfs_set_log_full_commit(root->fs_info, trans);
9578
9579         cache = btrfs_create_block_group_cache(root, chunk_offset, size);
9580         if (!cache)
9581                 return -ENOMEM;
9582
9583         btrfs_set_block_group_used(&cache->item, bytes_used);
9584         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
9585         btrfs_set_block_group_flags(&cache->item, type);
9586
9587         cache->flags = type;
9588         cache->last_byte_to_unpin = (u64)-1;
9589         cache->cached = BTRFS_CACHE_FINISHED;
9590         ret = exclude_super_stripes(root, cache);
9591         if (ret) {
9592                 /*
9593                  * We may have excluded something, so call this just in
9594                  * case.
9595                  */
9596                 free_excluded_extents(root, cache);
9597                 btrfs_put_block_group(cache);
9598                 return ret;
9599         }
9600
9601         add_new_free_space(cache, root->fs_info, chunk_offset,
9602                            chunk_offset + size);
9603
9604         free_excluded_extents(root, cache);
9605
9606         /*
9607          * Call to ensure the corresponding space_info object is created and
9608          * assigned to our block group, but don't update its counters just yet.
9609          * We want our bg to be added to the rbtree with its ->space_info set.
9610          */
9611         ret = update_space_info(root->fs_info, cache->flags, 0, 0,
9612                                 &cache->space_info);
9613         if (ret) {
9614                 btrfs_remove_free_space_cache(cache);
9615                 btrfs_put_block_group(cache);
9616                 return ret;
9617         }
9618
9619         ret = btrfs_add_block_group_cache(root->fs_info, cache);
9620         if (ret) {
9621                 btrfs_remove_free_space_cache(cache);
9622                 btrfs_put_block_group(cache);
9623                 return ret;
9624         }
9625
9626         /*
9627          * Now that our block group has its ->space_info set and is inserted in
9628          * the rbtree, update the space info's counters.
9629          */
9630         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
9631                                 &cache->space_info);
9632         if (ret) {
9633                 btrfs_remove_free_space_cache(cache);
9634                 spin_lock(&root->fs_info->block_group_cache_lock);
9635                 rb_erase(&cache->cache_node,
9636                          &root->fs_info->block_group_cache_tree);
9637                 RB_CLEAR_NODE(&cache->cache_node);
9638                 spin_unlock(&root->fs_info->block_group_cache_lock);
9639                 btrfs_put_block_group(cache);
9640                 return ret;
9641         }
9642         update_global_block_rsv(root->fs_info);
9643
9644         spin_lock(&cache->space_info->lock);
9645         cache->space_info->bytes_readonly += cache->bytes_super;
9646         spin_unlock(&cache->space_info->lock);
9647
9648         __link_block_group(cache->space_info, cache);
9649
9650         list_add_tail(&cache->bg_list, &trans->new_bgs);
9651
9652         set_avail_alloc_bits(extent_root->fs_info, type);
9653
9654         return 0;
9655 }
9656
9657 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9658 {
9659         u64 extra_flags = chunk_to_extended(flags) &
9660                                 BTRFS_EXTENDED_PROFILE_MASK;
9661
9662         write_seqlock(&fs_info->profiles_lock);
9663         if (flags & BTRFS_BLOCK_GROUP_DATA)
9664                 fs_info->avail_data_alloc_bits &= ~extra_flags;
9665         if (flags & BTRFS_BLOCK_GROUP_METADATA)
9666                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
9667         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
9668                 fs_info->avail_system_alloc_bits &= ~extra_flags;
9669         write_sequnlock(&fs_info->profiles_lock);
9670 }
9671
9672 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9673                              struct btrfs_root *root, u64 group_start,
9674                              struct extent_map *em)
9675 {
9676         struct btrfs_path *path;
9677         struct btrfs_block_group_cache *block_group;
9678         struct btrfs_free_cluster *cluster;
9679         struct btrfs_root *tree_root = root->fs_info->tree_root;
9680         struct btrfs_key key;
9681         struct inode *inode;
9682         struct kobject *kobj = NULL;
9683         int ret;
9684         int index;
9685         int factor;
9686         struct btrfs_caching_control *caching_ctl = NULL;
9687         bool remove_em;
9688
9689         root = root->fs_info->extent_root;
9690
9691         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
9692         BUG_ON(!block_group);
9693         BUG_ON(!block_group->ro);
9694
9695         /*
9696          * Free the reserved super bytes from this block group before
9697          * remove it.
9698          */
9699         free_excluded_extents(root, block_group);
9700
9701         memcpy(&key, &block_group->key, sizeof(key));
9702         index = get_block_group_index(block_group);
9703         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
9704                                   BTRFS_BLOCK_GROUP_RAID1 |
9705                                   BTRFS_BLOCK_GROUP_RAID10))
9706                 factor = 2;
9707         else
9708                 factor = 1;
9709
9710         /* make sure this block group isn't part of an allocation cluster */
9711         cluster = &root->fs_info->data_alloc_cluster;
9712         spin_lock(&cluster->refill_lock);
9713         btrfs_return_cluster_to_free_space(block_group, cluster);
9714         spin_unlock(&cluster->refill_lock);
9715
9716         /*
9717          * make sure this block group isn't part of a metadata
9718          * allocation cluster
9719          */
9720         cluster = &root->fs_info->meta_alloc_cluster;
9721         spin_lock(&cluster->refill_lock);
9722         btrfs_return_cluster_to_free_space(block_group, cluster);
9723         spin_unlock(&cluster->refill_lock);
9724
9725         path = btrfs_alloc_path();
9726         if (!path) {
9727                 ret = -ENOMEM;
9728                 goto out;
9729         }
9730
9731         /*
9732          * get the inode first so any iput calls done for the io_list
9733          * aren't the final iput (no unlinks allowed now)
9734          */
9735         inode = lookup_free_space_inode(tree_root, block_group, path);
9736
9737         mutex_lock(&trans->transaction->cache_write_mutex);
9738         /*
9739          * make sure our free spache cache IO is done before remove the
9740          * free space inode
9741          */
9742         spin_lock(&trans->transaction->dirty_bgs_lock);
9743         if (!list_empty(&block_group->io_list)) {
9744                 list_del_init(&block_group->io_list);
9745
9746                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
9747
9748                 spin_unlock(&trans->transaction->dirty_bgs_lock);
9749                 btrfs_wait_cache_io(root, trans, block_group,
9750                                     &block_group->io_ctl, path,
9751                                     block_group->key.objectid);
9752                 btrfs_put_block_group(block_group);
9753                 spin_lock(&trans->transaction->dirty_bgs_lock);
9754         }
9755
9756         if (!list_empty(&block_group->dirty_list)) {
9757                 list_del_init(&block_group->dirty_list);
9758                 btrfs_put_block_group(block_group);
9759         }
9760         spin_unlock(&trans->transaction->dirty_bgs_lock);
9761         mutex_unlock(&trans->transaction->cache_write_mutex);
9762
9763         if (!IS_ERR(inode)) {
9764                 ret = btrfs_orphan_add(trans, inode);
9765                 if (ret) {
9766                         btrfs_add_delayed_iput(inode);
9767                         goto out;
9768                 }
9769                 clear_nlink(inode);
9770                 /* One for the block groups ref */
9771                 spin_lock(&block_group->lock);
9772                 if (block_group->iref) {
9773                         block_group->iref = 0;
9774                         block_group->inode = NULL;
9775                         spin_unlock(&block_group->lock);
9776                         iput(inode);
9777                 } else {
9778                         spin_unlock(&block_group->lock);
9779                 }
9780                 /* One for our lookup ref */
9781                 btrfs_add_delayed_iput(inode);
9782         }
9783
9784         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9785         key.offset = block_group->key.objectid;
9786         key.type = 0;
9787
9788         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9789         if (ret < 0)
9790                 goto out;
9791         if (ret > 0)
9792                 btrfs_release_path(path);
9793         if (ret == 0) {
9794                 ret = btrfs_del_item(trans, tree_root, path);
9795                 if (ret)
9796                         goto out;
9797                 btrfs_release_path(path);
9798         }
9799
9800         spin_lock(&root->fs_info->block_group_cache_lock);
9801         rb_erase(&block_group->cache_node,
9802                  &root->fs_info->block_group_cache_tree);
9803         RB_CLEAR_NODE(&block_group->cache_node);
9804
9805         if (root->fs_info->first_logical_byte == block_group->key.objectid)
9806                 root->fs_info->first_logical_byte = (u64)-1;
9807         spin_unlock(&root->fs_info->block_group_cache_lock);
9808
9809         down_write(&block_group->space_info->groups_sem);
9810         /*
9811          * we must use list_del_init so people can check to see if they
9812          * are still on the list after taking the semaphore
9813          */
9814         list_del_init(&block_group->list);
9815         if (list_empty(&block_group->space_info->block_groups[index])) {
9816                 kobj = block_group->space_info->block_group_kobjs[index];
9817                 block_group->space_info->block_group_kobjs[index] = NULL;
9818                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
9819         }
9820         up_write(&block_group->space_info->groups_sem);
9821         if (kobj) {
9822                 kobject_del(kobj);
9823                 kobject_put(kobj);
9824         }
9825
9826         if (block_group->has_caching_ctl)
9827                 caching_ctl = get_caching_control(block_group);
9828         if (block_group->cached == BTRFS_CACHE_STARTED)
9829                 wait_block_group_cache_done(block_group);
9830         if (block_group->has_caching_ctl) {
9831                 down_write(&root->fs_info->commit_root_sem);
9832                 if (!caching_ctl) {
9833                         struct btrfs_caching_control *ctl;
9834
9835                         list_for_each_entry(ctl,
9836                                     &root->fs_info->caching_block_groups, list)
9837                                 if (ctl->block_group == block_group) {
9838                                         caching_ctl = ctl;
9839                                         atomic_inc(&caching_ctl->count);
9840                                         break;
9841                                 }
9842                 }
9843                 if (caching_ctl)
9844                         list_del_init(&caching_ctl->list);
9845                 up_write(&root->fs_info->commit_root_sem);
9846                 if (caching_ctl) {
9847                         /* Once for the caching bgs list and once for us. */
9848                         put_caching_control(caching_ctl);
9849                         put_caching_control(caching_ctl);
9850                 }
9851         }
9852
9853         spin_lock(&trans->transaction->dirty_bgs_lock);
9854         if (!list_empty(&block_group->dirty_list)) {
9855                 WARN_ON(1);
9856         }
9857         if (!list_empty(&block_group->io_list)) {
9858                 WARN_ON(1);
9859         }
9860         spin_unlock(&trans->transaction->dirty_bgs_lock);
9861         btrfs_remove_free_space_cache(block_group);
9862
9863         spin_lock(&block_group->space_info->lock);
9864         list_del_init(&block_group->ro_list);
9865
9866         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
9867                 WARN_ON(block_group->space_info->total_bytes
9868                         < block_group->key.offset);
9869                 WARN_ON(block_group->space_info->bytes_readonly
9870                         < block_group->key.offset);
9871                 WARN_ON(block_group->space_info->disk_total
9872                         < block_group->key.offset * factor);
9873         }
9874         block_group->space_info->total_bytes -= block_group->key.offset;
9875         block_group->space_info->bytes_readonly -= block_group->key.offset;
9876         block_group->space_info->disk_total -= block_group->key.offset * factor;
9877
9878         spin_unlock(&block_group->space_info->lock);
9879
9880         memcpy(&key, &block_group->key, sizeof(key));
9881
9882         lock_chunks(root);
9883         if (!list_empty(&em->list)) {
9884                 /* We're in the transaction->pending_chunks list. */
9885                 free_extent_map(em);
9886         }
9887         spin_lock(&block_group->lock);
9888         block_group->removed = 1;
9889         /*
9890          * At this point trimming can't start on this block group, because we
9891          * removed the block group from the tree fs_info->block_group_cache_tree
9892          * so no one can't find it anymore and even if someone already got this
9893          * block group before we removed it from the rbtree, they have already
9894          * incremented block_group->trimming - if they didn't, they won't find
9895          * any free space entries because we already removed them all when we
9896          * called btrfs_remove_free_space_cache().
9897          *
9898          * And we must not remove the extent map from the fs_info->mapping_tree
9899          * to prevent the same logical address range and physical device space
9900          * ranges from being reused for a new block group. This is because our
9901          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9902          * completely transactionless, so while it is trimming a range the
9903          * currently running transaction might finish and a new one start,
9904          * allowing for new block groups to be created that can reuse the same
9905          * physical device locations unless we take this special care.
9906          */
9907         remove_em = (atomic_read(&block_group->trimming) == 0);
9908         /*
9909          * Make sure a trimmer task always sees the em in the pinned_chunks list
9910          * if it sees block_group->removed == 1 (needs to lock block_group->lock
9911          * before checking block_group->removed).
9912          */
9913         if (!remove_em) {
9914                 /*
9915                  * Our em might be in trans->transaction->pending_chunks which
9916                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
9917                  * and so is the fs_info->pinned_chunks list.
9918                  *
9919                  * So at this point we must be holding the chunk_mutex to avoid
9920                  * any races with chunk allocation (more specifically at
9921                  * volumes.c:contains_pending_extent()), to ensure it always
9922                  * sees the em, either in the pending_chunks list or in the
9923                  * pinned_chunks list.
9924                  */
9925                 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
9926         }
9927         spin_unlock(&block_group->lock);
9928
9929         if (remove_em) {
9930                 struct extent_map_tree *em_tree;
9931
9932                 em_tree = &root->fs_info->mapping_tree.map_tree;
9933                 write_lock(&em_tree->lock);
9934                 /*
9935                  * The em might be in the pending_chunks list, so make sure the
9936                  * chunk mutex is locked, since remove_extent_mapping() will
9937                  * delete us from that list.
9938                  */
9939                 remove_extent_mapping(em_tree, em);
9940                 write_unlock(&em_tree->lock);
9941                 /* once for the tree */
9942                 free_extent_map(em);
9943         }
9944
9945         unlock_chunks(root);
9946
9947         btrfs_put_block_group(block_group);
9948         btrfs_put_block_group(block_group);
9949
9950         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9951         if (ret > 0)
9952                 ret = -EIO;
9953         if (ret < 0)
9954                 goto out;
9955
9956         ret = btrfs_del_item(trans, root, path);
9957 out:
9958         btrfs_free_path(path);
9959         return ret;
9960 }
9961
9962 /*
9963  * Process the unused_bgs list and remove any that don't have any allocated
9964  * space inside of them.
9965  */
9966 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9967 {
9968         struct btrfs_block_group_cache *block_group;
9969         struct btrfs_space_info *space_info;
9970         struct btrfs_root *root = fs_info->extent_root;
9971         struct btrfs_trans_handle *trans;
9972         int ret = 0;
9973
9974         if (!fs_info->open)
9975                 return;
9976
9977         spin_lock(&fs_info->unused_bgs_lock);
9978         while (!list_empty(&fs_info->unused_bgs)) {
9979                 u64 start, end;
9980
9981                 block_group = list_first_entry(&fs_info->unused_bgs,
9982                                                struct btrfs_block_group_cache,
9983                                                bg_list);
9984                 space_info = block_group->space_info;
9985                 list_del_init(&block_group->bg_list);
9986                 if (ret || btrfs_mixed_space_info(space_info)) {
9987                         btrfs_put_block_group(block_group);
9988                         continue;
9989                 }
9990                 spin_unlock(&fs_info->unused_bgs_lock);
9991
9992                 mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
9993
9994                 /* Don't want to race with allocators so take the groups_sem */
9995                 down_write(&space_info->groups_sem);
9996                 spin_lock(&block_group->lock);
9997                 if (block_group->reserved ||
9998                     btrfs_block_group_used(&block_group->item) ||
9999                     block_group->ro) {
10000                         /*
10001                          * We want to bail if we made new allocations or have
10002                          * outstanding allocations in this block group.  We do
10003                          * the ro check in case balance is currently acting on
10004                          * this block group.
10005                          */
10006                         spin_unlock(&block_group->lock);
10007                         up_write(&space_info->groups_sem);
10008                         goto next;
10009                 }
10010                 spin_unlock(&block_group->lock);
10011
10012                 /* We don't want to force the issue, only flip if it's ok. */
10013                 ret = set_block_group_ro(block_group, 0);
10014                 up_write(&space_info->groups_sem);
10015                 if (ret < 0) {
10016                         ret = 0;
10017                         goto next;
10018                 }
10019
10020                 /*
10021                  * Want to do this before we do anything else so we can recover
10022                  * properly if we fail to join the transaction.
10023                  */
10024                 /* 1 for btrfs_orphan_reserve_metadata() */
10025                 trans = btrfs_start_transaction(root, 1);
10026                 if (IS_ERR(trans)) {
10027                         btrfs_set_block_group_rw(root, block_group);
10028                         ret = PTR_ERR(trans);
10029                         goto next;
10030                 }
10031
10032                 /*
10033                  * We could have pending pinned extents for this block group,
10034                  * just delete them, we don't care about them anymore.
10035                  */
10036                 start = block_group->key.objectid;
10037                 end = start + block_group->key.offset - 1;
10038                 /*
10039                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10040                  * btrfs_finish_extent_commit(). If we are at transaction N,
10041                  * another task might be running finish_extent_commit() for the
10042                  * previous transaction N - 1, and have seen a range belonging
10043                  * to the block group in freed_extents[] before we were able to
10044                  * clear the whole block group range from freed_extents[]. This
10045                  * means that task can lookup for the block group after we
10046                  * unpinned it from freed_extents[] and removed it, leading to
10047                  * a BUG_ON() at btrfs_unpin_extent_range().
10048                  */
10049                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10050                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10051                                   EXTENT_DIRTY, GFP_NOFS);
10052                 if (ret) {
10053                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10054                         btrfs_set_block_group_rw(root, block_group);
10055                         goto end_trans;
10056                 }
10057                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10058                                   EXTENT_DIRTY, GFP_NOFS);
10059                 if (ret) {
10060                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10061                         btrfs_set_block_group_rw(root, block_group);
10062                         goto end_trans;
10063                 }
10064                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10065
10066                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10067                 spin_lock(&space_info->lock);
10068                 spin_lock(&block_group->lock);
10069
10070                 space_info->bytes_pinned -= block_group->pinned;
10071                 space_info->bytes_readonly += block_group->pinned;
10072                 percpu_counter_add(&space_info->total_bytes_pinned,
10073                                    -block_group->pinned);
10074                 block_group->pinned = 0;
10075
10076                 spin_unlock(&block_group->lock);
10077                 spin_unlock(&space_info->lock);
10078
10079                 /*
10080                  * Btrfs_remove_chunk will abort the transaction if things go
10081                  * horribly wrong.
10082                  */
10083                 ret = btrfs_remove_chunk(trans, root,
10084                                          block_group->key.objectid);
10085 end_trans:
10086                 btrfs_end_transaction(trans, root);
10087 next:
10088                 mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
10089                 btrfs_put_block_group(block_group);
10090                 spin_lock(&fs_info->unused_bgs_lock);
10091         }
10092         spin_unlock(&fs_info->unused_bgs_lock);
10093 }
10094
10095 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10096 {
10097         struct btrfs_space_info *space_info;
10098         struct btrfs_super_block *disk_super;
10099         u64 features;
10100         u64 flags;
10101         int mixed = 0;
10102         int ret;
10103
10104         disk_super = fs_info->super_copy;
10105         if (!btrfs_super_root(disk_super))
10106                 return 1;
10107
10108         features = btrfs_super_incompat_flags(disk_super);
10109         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10110                 mixed = 1;
10111
10112         flags = BTRFS_BLOCK_GROUP_SYSTEM;
10113         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10114         if (ret)
10115                 goto out;
10116
10117         if (mixed) {
10118                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10119                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10120         } else {
10121                 flags = BTRFS_BLOCK_GROUP_METADATA;
10122                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10123                 if (ret)
10124                         goto out;
10125
10126                 flags = BTRFS_BLOCK_GROUP_DATA;
10127                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10128         }
10129 out:
10130         return ret;
10131 }
10132
10133 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
10134 {
10135         return unpin_extent_range(root, start, end, false);
10136 }
10137
10138 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
10139 {
10140         struct btrfs_fs_info *fs_info = root->fs_info;
10141         struct btrfs_block_group_cache *cache = NULL;
10142         u64 group_trimmed;
10143         u64 start;
10144         u64 end;
10145         u64 trimmed = 0;
10146         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
10147         int ret = 0;
10148
10149         /*
10150          * try to trim all FS space, our block group may start from non-zero.
10151          */
10152         if (range->len == total_bytes)
10153                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10154         else
10155                 cache = btrfs_lookup_block_group(fs_info, range->start);
10156
10157         while (cache) {
10158                 if (cache->key.objectid >= (range->start + range->len)) {
10159                         btrfs_put_block_group(cache);
10160                         break;
10161                 }
10162
10163                 start = max(range->start, cache->key.objectid);
10164                 end = min(range->start + range->len,
10165                                 cache->key.objectid + cache->key.offset);
10166
10167                 if (end - start >= range->minlen) {
10168                         if (!block_group_cache_done(cache)) {
10169                                 ret = cache_block_group(cache, 0);
10170                                 if (ret) {
10171                                         btrfs_put_block_group(cache);
10172                                         break;
10173                                 }
10174                                 ret = wait_block_group_cache_done(cache);
10175                                 if (ret) {
10176                                         btrfs_put_block_group(cache);
10177                                         break;
10178                                 }
10179                         }
10180                         ret = btrfs_trim_block_group(cache,
10181                                                      &group_trimmed,
10182                                                      start,
10183                                                      end,
10184                                                      range->minlen);
10185
10186                         trimmed += group_trimmed;
10187                         if (ret) {
10188                                 btrfs_put_block_group(cache);
10189                                 break;
10190                         }
10191                 }
10192
10193                 cache = next_block_group(fs_info->tree_root, cache);
10194         }
10195
10196         range->len = trimmed;
10197         return ret;
10198 }
10199
10200 /*
10201  * btrfs_{start,end}_write_no_snapshoting() are similar to
10202  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
10203  * data into the page cache through nocow before the subvolume is snapshoted,
10204  * but flush the data into disk after the snapshot creation, or to prevent
10205  * operations while snapshoting is ongoing and that cause the snapshot to be
10206  * inconsistent (writes followed by expanding truncates for example).
10207  */
10208 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
10209 {
10210         percpu_counter_dec(&root->subv_writers->counter);
10211         /*
10212          * Make sure counter is updated before we wake up
10213          * waiters.
10214          */
10215         smp_mb();
10216         if (waitqueue_active(&root->subv_writers->wait))
10217                 wake_up(&root->subv_writers->wait);
10218 }
10219
10220 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
10221 {
10222         if (atomic_read(&root->will_be_snapshoted))
10223                 return 0;
10224
10225         percpu_counter_inc(&root->subv_writers->counter);
10226         /*
10227          * Make sure counter is updated before we check for snapshot creation.
10228          */
10229         smp_mb();
10230         if (atomic_read(&root->will_be_snapshoted)) {
10231                 btrfs_end_write_no_snapshoting(root);
10232                 return 0;
10233         }
10234         return 1;
10235 }