btrfs: make btrfs_issue_discard return bytes discarded
fs/btrfs/extent-tree.c  [cascardo/linux.git]
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include <linux/percpu_counter.h>
28 #include "hash.h"
29 #include "tree-log.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "volumes.h"
33 #include "raid56.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 #include "math.h"
37 #include "sysfs.h"
38 #include "qgroup.h"
39
40 #undef SCRAMBLE_DELAYED_REFS
41
42 /*
43  * control flags for do_chunk_alloc's force field
44  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
45  * if we really need one.
46  *
 47  * CHUNK_ALLOC_LIMITED means to only try to allocate one
 48  * if we have very few chunks already allocated.  This is
 49  * used as part of the clustering code to help make sure
 50  * we have a good pool of storage to cluster in, without
 51  * filling the FS with empty chunks.
 52  *
 53  * CHUNK_ALLOC_FORCE means it must try to allocate one.
54  *
55  */
56 enum {
57         CHUNK_ALLOC_NO_FORCE = 0,
58         CHUNK_ALLOC_LIMITED = 1,
59         CHUNK_ALLOC_FORCE = 2,
60 };
61
62 /*
63  * Control how reservations are dealt with.
64  *
65  * RESERVE_FREE - freeing a reservation.
66  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
67  *   ENOSPC accounting
68  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
69  *   bytes_may_use as the ENOSPC accounting is done elsewhere
70  */
71 enum {
72         RESERVE_FREE = 0,
73         RESERVE_ALLOC = 1,
74         RESERVE_ALLOC_NO_ACCOUNT = 2,
75 };
76
77 static int update_block_group(struct btrfs_trans_handle *trans,
78                               struct btrfs_root *root, u64 bytenr,
79                               u64 num_bytes, int alloc);
80 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
81                                 struct btrfs_root *root,
82                                 struct btrfs_delayed_ref_node *node, u64 parent,
83                                 u64 root_objectid, u64 owner_objectid,
84                                 u64 owner_offset, int refs_to_drop,
85                                 struct btrfs_delayed_extent_op *extra_op);
86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
87                                     struct extent_buffer *leaf,
88                                     struct btrfs_extent_item *ei);
89 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
90                                       struct btrfs_root *root,
91                                       u64 parent, u64 root_objectid,
92                                       u64 flags, u64 owner, u64 offset,
93                                       struct btrfs_key *ins, int ref_mod);
94 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
95                                      struct btrfs_root *root,
96                                      u64 parent, u64 root_objectid,
97                                      u64 flags, struct btrfs_disk_key *key,
98                                      int level, struct btrfs_key *ins,
99                                      int no_quota);
100 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
101                           struct btrfs_root *extent_root, u64 flags,
102                           int force);
103 static int find_next_key(struct btrfs_path *path, int level,
104                          struct btrfs_key *key);
105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
106                             int dump_block_groups);
107 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
108                                        u64 num_bytes, int reserve,
109                                        int delalloc);
110 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
111                                u64 num_bytes);
112 int btrfs_pin_extent(struct btrfs_root *root,
113                      u64 bytenr, u64 num_bytes, int reserved);
114
115 static noinline int
116 block_group_cache_done(struct btrfs_block_group_cache *cache)
117 {
118         smp_mb();
119         return cache->cached == BTRFS_CACHE_FINISHED ||
120                 cache->cached == BTRFS_CACHE_ERROR;
121 }
122
123 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
124 {
125         return (cache->flags & bits) == bits;
126 }
127
128 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
129 {
130         atomic_inc(&cache->count);
131 }
132
133 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
134 {
135         if (atomic_dec_and_test(&cache->count)) {
136                 WARN_ON(cache->pinned > 0);
137                 WARN_ON(cache->reserved > 0);
138                 kfree(cache->free_space_ctl);
139                 kfree(cache);
140         }
141 }
142
143 /*
144  * this adds the block group to the fs_info rb tree for the block group
145  * cache
146  */
147 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
148                                 struct btrfs_block_group_cache *block_group)
149 {
150         struct rb_node **p;
151         struct rb_node *parent = NULL;
152         struct btrfs_block_group_cache *cache;
153
154         spin_lock(&info->block_group_cache_lock);
155         p = &info->block_group_cache_tree.rb_node;
156
157         while (*p) {
158                 parent = *p;
159                 cache = rb_entry(parent, struct btrfs_block_group_cache,
160                                  cache_node);
161                 if (block_group->key.objectid < cache->key.objectid) {
162                         p = &(*p)->rb_left;
163                 } else if (block_group->key.objectid > cache->key.objectid) {
164                         p = &(*p)->rb_right;
165                 } else {
166                         spin_unlock(&info->block_group_cache_lock);
167                         return -EEXIST;
168                 }
169         }
170
171         rb_link_node(&block_group->cache_node, parent, p);
172         rb_insert_color(&block_group->cache_node,
173                         &info->block_group_cache_tree);
174
175         if (info->first_logical_byte > block_group->key.objectid)
176                 info->first_logical_byte = block_group->key.objectid;
177
178         spin_unlock(&info->block_group_cache_lock);
179
180         return 0;
181 }
182
183 /*
184  * This will return the block group at or after bytenr if contains is 0, else
185  * it will return the block group that contains the bytenr
186  */
187 static struct btrfs_block_group_cache *
188 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
189                               int contains)
190 {
191         struct btrfs_block_group_cache *cache, *ret = NULL;
192         struct rb_node *n;
193         u64 end, start;
194
195         spin_lock(&info->block_group_cache_lock);
196         n = info->block_group_cache_tree.rb_node;
197
198         while (n) {
199                 cache = rb_entry(n, struct btrfs_block_group_cache,
200                                  cache_node);
201                 end = cache->key.objectid + cache->key.offset - 1;
202                 start = cache->key.objectid;
203
204                 if (bytenr < start) {
205                         if (!contains && (!ret || start < ret->key.objectid))
206                                 ret = cache;
207                         n = n->rb_left;
208                 } else if (bytenr > start) {
209                         if (contains && bytenr <= end) {
210                                 ret = cache;
211                                 break;
212                         }
213                         n = n->rb_right;
214                 } else {
215                         ret = cache;
216                         break;
217                 }
218         }
219         if (ret) {
220                 btrfs_get_block_group(ret);
221                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
222                         info->first_logical_byte = ret->key.objectid;
223         }
224         spin_unlock(&info->block_group_cache_lock);
225
226         return ret;
227 }
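/*
 * Illustrative example (editorial addition, hypothetical layout): assume
 * three 1GiB block groups starting at 0, 1GiB and 2GiB.  Then:
 *
 *	block_group_cache_tree_search(info, 1GiB + 4096, 1) -> the 1GiB group
 *	block_group_cache_tree_search(info, 1GiB + 4096, 0) -> the 2GiB group
 *	block_group_cache_tree_search(info, 3GiB + 4096, 1) -> NULL
 *
 * i.e. contains == 1 asks "which group covers this byte", while
 * contains == 0 asks "which group starts at or after this byte".
 */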
228
229 static int add_excluded_extent(struct btrfs_root *root,
230                                u64 start, u64 num_bytes)
231 {
232         u64 end = start + num_bytes - 1;
233         set_extent_bits(&root->fs_info->freed_extents[0],
234                         start, end, EXTENT_UPTODATE, GFP_NOFS);
235         set_extent_bits(&root->fs_info->freed_extents[1],
236                         start, end, EXTENT_UPTODATE, GFP_NOFS);
237         return 0;
238 }
239
240 static void free_excluded_extents(struct btrfs_root *root,
241                                   struct btrfs_block_group_cache *cache)
242 {
243         u64 start, end;
244
245         start = cache->key.objectid;
246         end = start + cache->key.offset - 1;
247
248         clear_extent_bits(&root->fs_info->freed_extents[0],
249                           start, end, EXTENT_UPTODATE, GFP_NOFS);
250         clear_extent_bits(&root->fs_info->freed_extents[1],
251                           start, end, EXTENT_UPTODATE, GFP_NOFS);
252 }
253
254 static int exclude_super_stripes(struct btrfs_root *root,
255                                  struct btrfs_block_group_cache *cache)
256 {
257         u64 bytenr;
258         u64 *logical;
259         int stripe_len;
260         int i, nr, ret;
261
262         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
263                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
264                 cache->bytes_super += stripe_len;
265                 ret = add_excluded_extent(root, cache->key.objectid,
266                                           stripe_len);
267                 if (ret)
268                         return ret;
269         }
270
271         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
272                 bytenr = btrfs_sb_offset(i);
273                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
274                                        cache->key.objectid, bytenr,
275                                        0, &logical, &nr, &stripe_len);
276                 if (ret)
277                         return ret;
278
279                 while (nr--) {
280                         u64 start, len;
281
282                         if (logical[nr] > cache->key.objectid +
283                             cache->key.offset)
284                                 continue;
285
286                         if (logical[nr] + stripe_len <= cache->key.objectid)
287                                 continue;
288
289                         start = logical[nr];
290                         if (start < cache->key.objectid) {
291                                 start = cache->key.objectid;
292                                 len = (logical[nr] + stripe_len) - start;
293                         } else {
294                                 len = min_t(u64, stripe_len,
295                                             cache->key.objectid +
296                                             cache->key.offset - start);
297                         }
298
299                         cache->bytes_super += len;
300                         ret = add_excluded_extent(root, start, len);
301                         if (ret) {
302                                 kfree(logical);
303                                 return ret;
304                         }
305                 }
306
307                 kfree(logical);
308         }
309         return 0;
310 }
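/*
 * Editorial sketch (not part of the original file): the start/len
 * computation above is essentially interval intersection between a super
 * mirror stripe and the block group range.  A stand-alone version, with
 * hypothetical names:
 *
 *	static u64 clamp_stripe(u64 group_start, u64 group_len,
 *				u64 stripe_start, u64 stripe_len,
 *				u64 *overlap_start)
 *	{
 *		u64 group_end = group_start + group_len;    // exclusive
 *		u64 stripe_end = stripe_start + stripe_len; // exclusive
 *		u64 start = stripe_start > group_start ? stripe_start : group_start;
 *		u64 end = stripe_end < group_end ? stripe_end : group_end;
 *
 *		if (end <= start)
 *			return 0;	// stripe lies outside the group
 *		*overlap_start = start;
 *		return end - start;	// bytes charged to bytes_super
 *	}
 */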
311
312 static struct btrfs_caching_control *
313 get_caching_control(struct btrfs_block_group_cache *cache)
314 {
315         struct btrfs_caching_control *ctl;
316
317         spin_lock(&cache->lock);
318         if (!cache->caching_ctl) {
319                 spin_unlock(&cache->lock);
320                 return NULL;
321         }
322
323         ctl = cache->caching_ctl;
324         atomic_inc(&ctl->count);
325         spin_unlock(&cache->lock);
326         return ctl;
327 }
328
329 static void put_caching_control(struct btrfs_caching_control *ctl)
330 {
331         if (atomic_dec_and_test(&ctl->count))
332                 kfree(ctl);
333 }
334
335 /*
336  * this is only called by cache_block_group; since we could have freed extents,
337  * we need to check the pinned_extents for any extents that can't be used yet,
338  * because their free space will be released as soon as the transaction commits.
339  */
340 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
341                               struct btrfs_fs_info *info, u64 start, u64 end)
342 {
343         u64 extent_start, extent_end, size, total_added = 0;
344         int ret;
345
346         while (start < end) {
347                 ret = find_first_extent_bit(info->pinned_extents, start,
348                                             &extent_start, &extent_end,
349                                             EXTENT_DIRTY | EXTENT_UPTODATE,
350                                             NULL);
351                 if (ret)
352                         break;
353
354                 if (extent_start <= start) {
355                         start = extent_end + 1;
356                 } else if (extent_start > start && extent_start < end) {
357                         size = extent_start - start;
358                         total_added += size;
359                         ret = btrfs_add_free_space(block_group, start,
360                                                    size);
361                         BUG_ON(ret); /* -ENOMEM or logic error */
362                         start = extent_end + 1;
363                 } else {
364                         break;
365                 }
366         }
367
368         if (start < end) {
369                 size = end - start;
370                 total_added += size;
371                 ret = btrfs_add_free_space(block_group, start, size);
372                 BUG_ON(ret); /* -ENOMEM or logic error */
373         }
374
375         return total_added;
376 }
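/*
 * Editorial sketch (not part of the original file): the loop above hands
 * the gaps between pinned/uptodate ranges to the free space cache.  A
 * minimal stand-alone model of that walk, with hypothetical types and a
 * fake "find first busy range" helper:
 *
 *	struct busy_range { u64 start; u64 end; };	// end is inclusive
 *
 *	// index of the first busy range whose end is >= start, or -1
 *	static int find_first_busy(const struct busy_range *r, int nr, u64 start)
 *	{
 *		for (int i = 0; i < nr; i++)
 *			if (r[i].end >= start)
 *				return i;
 *		return -1;
 *	}
 *
 *	static u64 add_gaps(const struct busy_range *r, int nr, u64 start, u64 end)
 *	{
 *		u64 total = 0;
 *
 *		while (start < end) {
 *			int i = find_first_busy(r, nr, start);
 *
 *			if (i < 0 || r[i].start >= end)
 *				break;
 *			if (r[i].start > start)
 *				total += r[i].start - start;	// gap is free
 *			start = r[i].end + 1;
 *		}
 *		if (start < end)
 *			total += end - start;			// trailing gap
 *		return total;
 *	}
 */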
377
378 static noinline void caching_thread(struct btrfs_work *work)
379 {
380         struct btrfs_block_group_cache *block_group;
381         struct btrfs_fs_info *fs_info;
382         struct btrfs_caching_control *caching_ctl;
383         struct btrfs_root *extent_root;
384         struct btrfs_path *path;
385         struct extent_buffer *leaf;
386         struct btrfs_key key;
387         u64 total_found = 0;
388         u64 last = 0;
389         u32 nritems;
390         int ret = -ENOMEM;
391
392         caching_ctl = container_of(work, struct btrfs_caching_control, work);
393         block_group = caching_ctl->block_group;
394         fs_info = block_group->fs_info;
395         extent_root = fs_info->extent_root;
396
397         path = btrfs_alloc_path();
398         if (!path)
399                 goto out;
400
401         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
402
403         /*
404          * We don't want to deadlock with somebody trying to allocate a new
405          * extent for the extent root while also trying to search the extent
406          * root to add free space.  So we skip locking and search the commit
407          * root, since it's read-only.
408          */
409         path->skip_locking = 1;
410         path->search_commit_root = 1;
411         path->reada = 1;
412
413         key.objectid = last;
414         key.offset = 0;
415         key.type = BTRFS_EXTENT_ITEM_KEY;
416 again:
417         mutex_lock(&caching_ctl->mutex);
418         /* need to make sure the commit_root doesn't disappear */
419         down_read(&fs_info->commit_root_sem);
420
421 next:
422         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
423         if (ret < 0)
424                 goto err;
425
426         leaf = path->nodes[0];
427         nritems = btrfs_header_nritems(leaf);
428
429         while (1) {
430                 if (btrfs_fs_closing(fs_info) > 1) {
431                         last = (u64)-1;
432                         break;
433                 }
434
435                 if (path->slots[0] < nritems) {
436                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
437                 } else {
438                         ret = find_next_key(path, 0, &key);
439                         if (ret)
440                                 break;
441
442                         if (need_resched() ||
443                             rwsem_is_contended(&fs_info->commit_root_sem)) {
444                                 caching_ctl->progress = last;
445                                 btrfs_release_path(path);
446                                 up_read(&fs_info->commit_root_sem);
447                                 mutex_unlock(&caching_ctl->mutex);
448                                 cond_resched();
449                                 goto again;
450                         }
451
452                         ret = btrfs_next_leaf(extent_root, path);
453                         if (ret < 0)
454                                 goto err;
455                         if (ret)
456                                 break;
457                         leaf = path->nodes[0];
458                         nritems = btrfs_header_nritems(leaf);
459                         continue;
460                 }
461
462                 if (key.objectid < last) {
463                         key.objectid = last;
464                         key.offset = 0;
465                         key.type = BTRFS_EXTENT_ITEM_KEY;
466
467                         caching_ctl->progress = last;
468                         btrfs_release_path(path);
469                         goto next;
470                 }
471
472                 if (key.objectid < block_group->key.objectid) {
473                         path->slots[0]++;
474                         continue;
475                 }
476
477                 if (key.objectid >= block_group->key.objectid +
478                     block_group->key.offset)
479                         break;
480
481                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
482                     key.type == BTRFS_METADATA_ITEM_KEY) {
483                         total_found += add_new_free_space(block_group,
484                                                           fs_info, last,
485                                                           key.objectid);
486                         if (key.type == BTRFS_METADATA_ITEM_KEY)
487                                 last = key.objectid +
488                                         fs_info->tree_root->nodesize;
489                         else
490                                 last = key.objectid + key.offset;
491
492                         if (total_found > (1024 * 1024 * 2)) {
493                                 total_found = 0;
494                                 wake_up(&caching_ctl->wait);
495                         }
496                 }
497                 path->slots[0]++;
498         }
499         ret = 0;
500
501         total_found += add_new_free_space(block_group, fs_info, last,
502                                           block_group->key.objectid +
503                                           block_group->key.offset);
504         caching_ctl->progress = (u64)-1;
505
506         spin_lock(&block_group->lock);
507         block_group->caching_ctl = NULL;
508         block_group->cached = BTRFS_CACHE_FINISHED;
509         spin_unlock(&block_group->lock);
510
511 err:
512         btrfs_free_path(path);
513         up_read(&fs_info->commit_root_sem);
514
515         free_excluded_extents(extent_root, block_group);
516
517         mutex_unlock(&caching_ctl->mutex);
518 out:
519         if (ret) {
520                 spin_lock(&block_group->lock);
521                 block_group->caching_ctl = NULL;
522                 block_group->cached = BTRFS_CACHE_ERROR;
523                 spin_unlock(&block_group->lock);
524         }
525         wake_up(&caching_ctl->wait);
526
527         put_caching_control(caching_ctl);
528         btrfs_put_block_group(block_group);
529 }
530
531 static int cache_block_group(struct btrfs_block_group_cache *cache,
532                              int load_cache_only)
533 {
534         DEFINE_WAIT(wait);
535         struct btrfs_fs_info *fs_info = cache->fs_info;
536         struct btrfs_caching_control *caching_ctl;
537         int ret = 0;
538
539         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
540         if (!caching_ctl)
541                 return -ENOMEM;
542
543         INIT_LIST_HEAD(&caching_ctl->list);
544         mutex_init(&caching_ctl->mutex);
545         init_waitqueue_head(&caching_ctl->wait);
546         caching_ctl->block_group = cache;
547         caching_ctl->progress = cache->key.objectid;
548         atomic_set(&caching_ctl->count, 1);
549         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
550                         caching_thread, NULL, NULL);
551
552         spin_lock(&cache->lock);
553         /*
554          * This should be a rare occasion, but it could happen in the
555          * case where one thread starts to load the space cache info, and then
556          * some other thread starts a transaction commit which tries to do an
557          * allocation while the other thread is still loading the space cache
558          * info.  The previous loop should have kept us from choosing this block
559          * group, but if we've moved to the state where we will wait on caching
560          * block groups we need to first check if we're doing a fast load here,
561          * so we can wait for it to finish, otherwise we could end up allocating
562          * from a block group whose cache gets evicted for one reason or
563          * another.
564          */
565         while (cache->cached == BTRFS_CACHE_FAST) {
566                 struct btrfs_caching_control *ctl;
567
568                 ctl = cache->caching_ctl;
569                 atomic_inc(&ctl->count);
570                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
571                 spin_unlock(&cache->lock);
572
573                 schedule();
574
575                 finish_wait(&ctl->wait, &wait);
576                 put_caching_control(ctl);
577                 spin_lock(&cache->lock);
578         }
579
580         if (cache->cached != BTRFS_CACHE_NO) {
581                 spin_unlock(&cache->lock);
582                 kfree(caching_ctl);
583                 return 0;
584         }
585         WARN_ON(cache->caching_ctl);
586         cache->caching_ctl = caching_ctl;
587         cache->cached = BTRFS_CACHE_FAST;
588         spin_unlock(&cache->lock);
589
590         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
591                 mutex_lock(&caching_ctl->mutex);
592                 ret = load_free_space_cache(fs_info, cache);
593
594                 spin_lock(&cache->lock);
595                 if (ret == 1) {
596                         cache->caching_ctl = NULL;
597                         cache->cached = BTRFS_CACHE_FINISHED;
598                         cache->last_byte_to_unpin = (u64)-1;
599                         caching_ctl->progress = (u64)-1;
600                 } else {
601                         if (load_cache_only) {
602                                 cache->caching_ctl = NULL;
603                                 cache->cached = BTRFS_CACHE_NO;
604                         } else {
605                                 cache->cached = BTRFS_CACHE_STARTED;
606                                 cache->has_caching_ctl = 1;
607                         }
608                 }
609                 spin_unlock(&cache->lock);
610                 mutex_unlock(&caching_ctl->mutex);
611
612                 wake_up(&caching_ctl->wait);
613                 if (ret == 1) {
614                         put_caching_control(caching_ctl);
615                         free_excluded_extents(fs_info->extent_root, cache);
616                         return 0;
617                 }
618         } else {
619                 /*
620                  * We are not going to do the fast caching, set cached to the
621                  * appropriate value and wakeup any waiters.
622                  */
623                 spin_lock(&cache->lock);
624                 if (load_cache_only) {
625                         cache->caching_ctl = NULL;
626                         cache->cached = BTRFS_CACHE_NO;
627                 } else {
628                         cache->cached = BTRFS_CACHE_STARTED;
629                         cache->has_caching_ctl = 1;
630                 }
631                 spin_unlock(&cache->lock);
632                 wake_up(&caching_ctl->wait);
633         }
634
635         if (load_cache_only) {
636                 put_caching_control(caching_ctl);
637                 return 0;
638         }
639
640         down_write(&fs_info->commit_root_sem);
641         atomic_inc(&caching_ctl->count);
642         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
643         up_write(&fs_info->commit_root_sem);
644
645         btrfs_get_block_group(cache);
646
647         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
648
649         return ret;
650 }
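/*
 * Editorial note (based on the code above): the BTRFS_CACHE_FAST wait
 * loop takes its own reference on the caching control before dropping
 * cache->lock, so the ctl cannot be freed while this thread sleeps on
 * its waitqueue; the reference is dropped with put_caching_control()
 * right after finish_wait().
 */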
651
652 /*
653  * return the block group that starts at or after bytenr
654  */
655 static struct btrfs_block_group_cache *
656 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
657 {
658         struct btrfs_block_group_cache *cache;
659
660         cache = block_group_cache_tree_search(info, bytenr, 0);
661
662         return cache;
663 }
664
665 /*
666  * return the block group that contains the given bytenr
667  */
668 struct btrfs_block_group_cache *btrfs_lookup_block_group(
669                                                  struct btrfs_fs_info *info,
670                                                  u64 bytenr)
671 {
672         struct btrfs_block_group_cache *cache;
673
674         cache = block_group_cache_tree_search(info, bytenr, 1);
675
676         return cache;
677 }
678
679 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
680                                                   u64 flags)
681 {
682         struct list_head *head = &info->space_info;
683         struct btrfs_space_info *found;
684
685         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
686
687         rcu_read_lock();
688         list_for_each_entry_rcu(found, head, list) {
689                 if (found->flags & flags) {
690                         rcu_read_unlock();
691                         return found;
692                 }
693         }
694         rcu_read_unlock();
695         return NULL;
696 }
697
698 /*
699  * after adding space to the filesystem, we need to clear the full flags
700  * on all the space infos.
701  */
702 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
703 {
704         struct list_head *head = &info->space_info;
705         struct btrfs_space_info *found;
706
707         rcu_read_lock();
708         list_for_each_entry_rcu(found, head, list)
709                 found->full = 0;
710         rcu_read_unlock();
711 }
712
713 /* simple helper to search for an existing data extent at a given offset */
714 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
715 {
716         int ret;
717         struct btrfs_key key;
718         struct btrfs_path *path;
719
720         path = btrfs_alloc_path();
721         if (!path)
722                 return -ENOMEM;
723
724         key.objectid = start;
725         key.offset = len;
726         key.type = BTRFS_EXTENT_ITEM_KEY;
727         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
728                                 0, 0);
729         btrfs_free_path(path);
730         return ret;
731 }
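/*
 * Editorial note: the return value follows btrfs_search_slot() semantics,
 * i.e. 0 if an extent item with exactly this (start, len) key exists, a
 * positive value if it does not, and a negative errno on error.
 */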
732
733 /*
734  * helper function to lookup reference count and flags of a tree block.
735  *
736  * the head node for a delayed ref is used to store the sum of all the
737  * reference count modifications queued up in the rbtree.  The head
738  * node may also store the extent flags to set.  This way you can check
739  * to see what the reference count and extent flags would be once all of
740  * the delayed refs are processed.
741  */
742 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
743                              struct btrfs_root *root, u64 bytenr,
744                              u64 offset, int metadata, u64 *refs, u64 *flags)
745 {
746         struct btrfs_delayed_ref_head *head;
747         struct btrfs_delayed_ref_root *delayed_refs;
748         struct btrfs_path *path;
749         struct btrfs_extent_item *ei;
750         struct extent_buffer *leaf;
751         struct btrfs_key key;
752         u32 item_size;
753         u64 num_refs;
754         u64 extent_flags;
755         int ret;
756
757         /*
758          * If we don't have skinny metadata, don't bother doing anything
759          * different
760          */
761         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
762                 offset = root->nodesize;
763                 metadata = 0;
764         }
765
766         path = btrfs_alloc_path();
767         if (!path)
768                 return -ENOMEM;
769
770         if (!trans) {
771                 path->skip_locking = 1;
772                 path->search_commit_root = 1;
773         }
774
775 search_again:
776         key.objectid = bytenr;
777         key.offset = offset;
778         if (metadata)
779                 key.type = BTRFS_METADATA_ITEM_KEY;
780         else
781                 key.type = BTRFS_EXTENT_ITEM_KEY;
782
783         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
784                                 &key, path, 0, 0);
785         if (ret < 0)
786                 goto out_free;
787
788         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
789                 if (path->slots[0]) {
790                         path->slots[0]--;
791                         btrfs_item_key_to_cpu(path->nodes[0], &key,
792                                               path->slots[0]);
793                         if (key.objectid == bytenr &&
794                             key.type == BTRFS_EXTENT_ITEM_KEY &&
795                             key.offset == root->nodesize)
796                                 ret = 0;
797                 }
798         }
799
800         if (ret == 0) {
801                 leaf = path->nodes[0];
802                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
803                 if (item_size >= sizeof(*ei)) {
804                         ei = btrfs_item_ptr(leaf, path->slots[0],
805                                             struct btrfs_extent_item);
806                         num_refs = btrfs_extent_refs(leaf, ei);
807                         extent_flags = btrfs_extent_flags(leaf, ei);
808                 } else {
809 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
810                         struct btrfs_extent_item_v0 *ei0;
811                         BUG_ON(item_size != sizeof(*ei0));
812                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
813                                              struct btrfs_extent_item_v0);
814                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
815                         /* FIXME: this isn't correct for data */
816                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
817 #else
818                         BUG();
819 #endif
820                 }
821                 BUG_ON(num_refs == 0);
822         } else {
823                 num_refs = 0;
824                 extent_flags = 0;
825                 ret = 0;
826         }
827
828         if (!trans)
829                 goto out;
830
831         delayed_refs = &trans->transaction->delayed_refs;
832         spin_lock(&delayed_refs->lock);
833         head = btrfs_find_delayed_ref_head(trans, bytenr);
834         if (head) {
835                 if (!mutex_trylock(&head->mutex)) {
836                         atomic_inc(&head->node.refs);
837                         spin_unlock(&delayed_refs->lock);
838
839                         btrfs_release_path(path);
840
841                         /*
842                          * Mutex was contended, block until it's released and try
843                          * again
844                          */
845                         mutex_lock(&head->mutex);
846                         mutex_unlock(&head->mutex);
847                         btrfs_put_delayed_ref(&head->node);
848                         goto search_again;
849                 }
850                 spin_lock(&head->lock);
851                 if (head->extent_op && head->extent_op->update_flags)
852                         extent_flags |= head->extent_op->flags_to_set;
853                 else
854                         BUG_ON(num_refs == 0);
855
856                 num_refs += head->node.ref_mod;
857                 spin_unlock(&head->lock);
858                 mutex_unlock(&head->mutex);
859         }
860         spin_unlock(&delayed_refs->lock);
861 out:
862         WARN_ON(num_refs == 0);
863         if (refs)
864                 *refs = num_refs;
865         if (flags)
866                 *flags = extent_flags;
867 out_free:
868         btrfs_free_path(path);
869         return ret;
870 }
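/*
 * Editorial sketch (not part of the original file): the value reported
 * above is effectively
 *
 *	effective_refs  = on_disk_refs + head->node.ref_mod;
 *	effective_flags = on_disk_flags |
 *			  (head->extent_op && head->extent_op->update_flags ?
 *			   head->extent_op->flags_to_set : 0);
 *
 * i.e. what the extent item will look like once the queued delayed refs
 * are run, without actually running them.
 */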
871
872 /*
873  * Back reference rules.  Back refs have three main goals:
874  *
875  * 1) differentiate between all holders of references to an extent so that
876  *    when a reference is dropped we can make sure it was a valid reference
877  *    before freeing the extent.
878  *
879  * 2) Provide enough information to quickly find the holders of an extent
880  *    if we notice a given block is corrupted or bad.
881  *
882  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
883  *    maintenance.  This is actually the same as #2, but with a slightly
884  *    different use case.
885  *
886  * There are two kinds of back refs. The implicit back ref is optimized
887  * for pointers in non-shared tree blocks. For a given pointer in a block,
888  * back refs of this kind provide information about the block's owner tree
889  * and the pointer's key. This information allows us to find the block by
890  * b-tree searching. The full back ref is for pointers in tree blocks not
891  * referenced by their owner trees. The location of the tree block is
892  * recorded in the back refs. Actually the full back ref is generic, and can
893  * be used in all cases where the implicit back ref is used. The major
894  * shortcoming of the full back ref is its overhead. Every time a tree block
895  * gets COWed, we have to update the back ref entries for all pointers in it.
896  *
897  * For a newly allocated tree block, we use implicit back refs for
898  * pointers in it. This means most tree related operations only involve
899  * implicit back refs. For a tree block created in an old transaction, the
900  * only way to drop a reference to it is to COW it. So we can detect the
901  * event that a tree block loses its owner tree's reference and do the
902  * back ref conversion.
903  *
904  * When a tree block is COW'd through a tree, there are four cases:
905  *
906  * The reference count of the block is one and the tree is the block's
907  * owner tree. Nothing to do in this case.
908  *
909  * The reference count of the block is one and the tree is not the
910  * block's owner tree. In this case, full back refs are used for pointers
911  * in the block. Remove these full back refs, add implicit back refs for
912  * every pointer in the new block.
913  *
914  * The reference count of the block is greater than one and the tree is
915  * the block's owner tree. In this case, implicit back refs are used for
916  * pointers in the block. Add full back refs for every pointer in the
917  * block, increase lower level extents' reference counts. The original
918  * implicit back refs are inherited by the new block.
919  *
920  * The reference count of the block is greater than one and the tree is
921  * not the block's owner tree. Add implicit back refs for every pointer in
922  * the new block, increase lower level extents' reference count.
923  *
924  * Back reference key composition:
925  *
926  * The key objectid corresponds to the first byte in the extent,
927  * The key type is used to differentiate between types of back refs.
928  * There are different meanings of the key offset for different types
929  * of back refs.
930  *
931  * File extents can be referenced by:
932  *
933  * - multiple snapshots, subvolumes, or different generations in one subvol
934  * - different files inside a single subvolume
935  * - different offsets inside a file (bookend extents in file.c)
936  *
937  * The extent ref structure for the implicit back refs has fields for:
938  *
939  * - Objectid of the subvolume root
940  * - objectid of the file holding the reference
941  * - original offset in the file
942  * - how many bookend extents
943  *
944  * The key offset for the implicit back refs is a hash of the first
945  * three fields.
946  *
947  * The extent ref structure for the full back refs has a field for:
948  *
949  * - number of pointers in the tree leaf
950  *
951  * The key offset for the full back refs is the first byte of
952  * the tree leaf.
953  *
954  * When a file extent is allocated, the implicit back refs are used
955  * and the fields are filled in:
956  *
957  *     (root_key.objectid, inode objectid, offset in file, 1)
958  *
959  * When a file extent is removed during file truncation, we find the
960  * corresponding implicit back refs and check the following fields:
961  *
962  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
963  *
964  * Btree extents can be referenced by:
965  *
966  * - Different subvolumes
967  *
968  * Both the implicit back refs and the full back refs for tree blocks
969  * only consist of a key. The key offset for the implicit back refs is
970  * the objectid of the block's owner tree. The key offset for the full back
971  * refs is the first byte of the parent block.
972  *
973  * When implicit back refs are used, information about the lowest key and
974  * level of the tree block is required. This information is stored in the
975  * tree block info structure.
976  */
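/*
 * Editorial example (hypothetical numbers): a data extent at bytenr X
 * that is referenced from file 257 at offset 0 in subvolume 5 gets the
 * implicit back ref key
 *
 *	(X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * while the same extent referenced from a shared leaf at bytenr P
 * (parent set) would instead carry the full back ref key
 *
 *	(X, BTRFS_SHARED_DATA_REF_KEY, P)
 *
 * Tree blocks use (X, BTRFS_TREE_BLOCK_REF_KEY, owner root objectid) for
 * the implicit form and (X, BTRFS_SHARED_BLOCK_REF_KEY, parent bytenr)
 * for the full form, matching extent_ref_type() below.
 */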
977
978 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
979 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
980                                   struct btrfs_root *root,
981                                   struct btrfs_path *path,
982                                   u64 owner, u32 extra_size)
983 {
984         struct btrfs_extent_item *item;
985         struct btrfs_extent_item_v0 *ei0;
986         struct btrfs_extent_ref_v0 *ref0;
987         struct btrfs_tree_block_info *bi;
988         struct extent_buffer *leaf;
989         struct btrfs_key key;
990         struct btrfs_key found_key;
991         u32 new_size = sizeof(*item);
992         u64 refs;
993         int ret;
994
995         leaf = path->nodes[0];
996         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
997
998         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
999         ei0 = btrfs_item_ptr(leaf, path->slots[0],
1000                              struct btrfs_extent_item_v0);
1001         refs = btrfs_extent_refs_v0(leaf, ei0);
1002
1003         if (owner == (u64)-1) {
1004                 while (1) {
1005                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1006                                 ret = btrfs_next_leaf(root, path);
1007                                 if (ret < 0)
1008                                         return ret;
1009                                 BUG_ON(ret > 0); /* Corruption */
1010                                 leaf = path->nodes[0];
1011                         }
1012                         btrfs_item_key_to_cpu(leaf, &found_key,
1013                                               path->slots[0]);
1014                         BUG_ON(key.objectid != found_key.objectid);
1015                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1016                                 path->slots[0]++;
1017                                 continue;
1018                         }
1019                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1020                                               struct btrfs_extent_ref_v0);
1021                         owner = btrfs_ref_objectid_v0(leaf, ref0);
1022                         break;
1023                 }
1024         }
1025         btrfs_release_path(path);
1026
1027         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1028                 new_size += sizeof(*bi);
1029
1030         new_size -= sizeof(*ei0);
1031         ret = btrfs_search_slot(trans, root, &key, path,
1032                                 new_size + extra_size, 1);
1033         if (ret < 0)
1034                 return ret;
1035         BUG_ON(ret); /* Corruption */
1036
1037         btrfs_extend_item(root, path, new_size);
1038
1039         leaf = path->nodes[0];
1040         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1041         btrfs_set_extent_refs(leaf, item, refs);
1042         /* FIXME: get real generation */
1043         btrfs_set_extent_generation(leaf, item, 0);
1044         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1045                 btrfs_set_extent_flags(leaf, item,
1046                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1047                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1048                 bi = (struct btrfs_tree_block_info *)(item + 1);
1049                 /* FIXME: get first key of the block */
1050                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1051                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1052         } else {
1053                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1054         }
1055         btrfs_mark_buffer_dirty(leaf);
1056         return 0;
1057 }
1058 #endif
1059
1060 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1061 {
1062         u32 high_crc = ~(u32)0;
1063         u32 low_crc = ~(u32)0;
1064         __le64 lenum;
1065
1066         lenum = cpu_to_le64(root_objectid);
1067         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1068         lenum = cpu_to_le64(owner);
1069         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1070         lenum = cpu_to_le64(offset);
1071         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1072
1073         return ((u64)high_crc << 31) ^ (u64)low_crc;
1074 }
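/*
 * Editorial sketch (assumption): hash_extent_data_ref() above folds
 * (root_objectid, owner, offset) into a single key offset by feeding the
 * little-endian encodings through crc32c and packing the two CRCs into a
 * u64.  Assuming btrfs_crc32c() is the raw crc32c update (reflected
 * Castagnoli polynomial 0x82f63b78, no final inversion), the same value
 * should be reproducible outside the kernel like this:
 *
 *	static u32 sw_crc32c(u32 crc, const u8 *p, size_t len)
 *	{
 *		while (len--) {
 *			crc ^= *p++;
 *			for (int i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
 *		}
 *		return crc;
 *	}
 *
 *	static u64 data_ref_hash(u64 root, u64 owner, u64 offset)
 *	{
 *		u8 le[8];
 *		u32 hi = ~0u, lo = ~0u;
 *		int i;
 *
 *		for (i = 0; i < 8; i++) le[i] = root >> (8 * i);
 *		hi = sw_crc32c(hi, le, 8);
 *		for (i = 0; i < 8; i++) le[i] = owner >> (8 * i);
 *		lo = sw_crc32c(lo, le, 8);
 *		for (i = 0; i < 8; i++) le[i] = offset >> (8 * i);
 *		lo = sw_crc32c(lo, le, 8);
 *
 *		return ((u64)hi << 31) ^ (u64)lo;
 *	}
 */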
1075
1076 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1077                                      struct btrfs_extent_data_ref *ref)
1078 {
1079         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1080                                     btrfs_extent_data_ref_objectid(leaf, ref),
1081                                     btrfs_extent_data_ref_offset(leaf, ref));
1082 }
1083
1084 static int match_extent_data_ref(struct extent_buffer *leaf,
1085                                  struct btrfs_extent_data_ref *ref,
1086                                  u64 root_objectid, u64 owner, u64 offset)
1087 {
1088         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1089             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1090             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1091                 return 0;
1092         return 1;
1093 }
1094
1095 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1096                                            struct btrfs_root *root,
1097                                            struct btrfs_path *path,
1098                                            u64 bytenr, u64 parent,
1099                                            u64 root_objectid,
1100                                            u64 owner, u64 offset)
1101 {
1102         struct btrfs_key key;
1103         struct btrfs_extent_data_ref *ref;
1104         struct extent_buffer *leaf;
1105         u32 nritems;
1106         int ret;
1107         int recow;
1108         int err = -ENOENT;
1109
1110         key.objectid = bytenr;
1111         if (parent) {
1112                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1113                 key.offset = parent;
1114         } else {
1115                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1116                 key.offset = hash_extent_data_ref(root_objectid,
1117                                                   owner, offset);
1118         }
1119 again:
1120         recow = 0;
1121         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1122         if (ret < 0) {
1123                 err = ret;
1124                 goto fail;
1125         }
1126
1127         if (parent) {
1128                 if (!ret)
1129                         return 0;
1130 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1131                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1132                 btrfs_release_path(path);
1133                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1134                 if (ret < 0) {
1135                         err = ret;
1136                         goto fail;
1137                 }
1138                 if (!ret)
1139                         return 0;
1140 #endif
1141                 goto fail;
1142         }
1143
1144         leaf = path->nodes[0];
1145         nritems = btrfs_header_nritems(leaf);
1146         while (1) {
1147                 if (path->slots[0] >= nritems) {
1148                         ret = btrfs_next_leaf(root, path);
1149                         if (ret < 0)
1150                                 err = ret;
1151                         if (ret)
1152                                 goto fail;
1153
1154                         leaf = path->nodes[0];
1155                         nritems = btrfs_header_nritems(leaf);
1156                         recow = 1;
1157                 }
1158
1159                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1160                 if (key.objectid != bytenr ||
1161                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1162                         goto fail;
1163
1164                 ref = btrfs_item_ptr(leaf, path->slots[0],
1165                                      struct btrfs_extent_data_ref);
1166
1167                 if (match_extent_data_ref(leaf, ref, root_objectid,
1168                                           owner, offset)) {
1169                         if (recow) {
1170                                 btrfs_release_path(path);
1171                                 goto again;
1172                         }
1173                         err = 0;
1174                         break;
1175                 }
1176                 path->slots[0]++;
1177         }
1178 fail:
1179         return err;
1180 }
1181
1182 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1183                                            struct btrfs_root *root,
1184                                            struct btrfs_path *path,
1185                                            u64 bytenr, u64 parent,
1186                                            u64 root_objectid, u64 owner,
1187                                            u64 offset, int refs_to_add)
1188 {
1189         struct btrfs_key key;
1190         struct extent_buffer *leaf;
1191         u32 size;
1192         u32 num_refs;
1193         int ret;
1194
1195         key.objectid = bytenr;
1196         if (parent) {
1197                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1198                 key.offset = parent;
1199                 size = sizeof(struct btrfs_shared_data_ref);
1200         } else {
1201                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1202                 key.offset = hash_extent_data_ref(root_objectid,
1203                                                   owner, offset);
1204                 size = sizeof(struct btrfs_extent_data_ref);
1205         }
1206
1207         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1208         if (ret && ret != -EEXIST)
1209                 goto fail;
1210
1211         leaf = path->nodes[0];
1212         if (parent) {
1213                 struct btrfs_shared_data_ref *ref;
1214                 ref = btrfs_item_ptr(leaf, path->slots[0],
1215                                      struct btrfs_shared_data_ref);
1216                 if (ret == 0) {
1217                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1218                 } else {
1219                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1220                         num_refs += refs_to_add;
1221                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1222                 }
1223         } else {
1224                 struct btrfs_extent_data_ref *ref;
1225                 while (ret == -EEXIST) {
1226                         ref = btrfs_item_ptr(leaf, path->slots[0],
1227                                              struct btrfs_extent_data_ref);
1228                         if (match_extent_data_ref(leaf, ref, root_objectid,
1229                                                   owner, offset))
1230                                 break;
1231                         btrfs_release_path(path);
1232                         key.offset++;
1233                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1234                                                       size);
1235                         if (ret && ret != -EEXIST)
1236                                 goto fail;
1237
1238                         leaf = path->nodes[0];
1239                 }
1240                 ref = btrfs_item_ptr(leaf, path->slots[0],
1241                                      struct btrfs_extent_data_ref);
1242                 if (ret == 0) {
1243                         btrfs_set_extent_data_ref_root(leaf, ref,
1244                                                        root_objectid);
1245                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1246                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1247                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1248                 } else {
1249                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1250                         num_refs += refs_to_add;
1251                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1252                 }
1253         }
1254         btrfs_mark_buffer_dirty(leaf);
1255         ret = 0;
1256 fail:
1257         btrfs_release_path(path);
1258         return ret;
1259 }
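/*
 * Editorial note (not part of the original file): the key offset of an
 * implicit data ref is a hash, so distinct (root, objectid, offset)
 * tuples can collide.  The -EEXIST loop above resolves collisions by
 * linear probing: if the item at key.offset holds a ref for a different
 * tuple, bump key.offset by one and retry the insert, until either a
 * matching ref item is found (its count is then increased) or a free
 * key is inserted.  Lookups in lookup_extent_data_ref() walk forward
 * from the hashed offset the same way, so the probe chain stays
 * consistent.
 */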
1260
1261 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1262                                            struct btrfs_root *root,
1263                                            struct btrfs_path *path,
1264                                            int refs_to_drop, int *last_ref)
1265 {
1266         struct btrfs_key key;
1267         struct btrfs_extent_data_ref *ref1 = NULL;
1268         struct btrfs_shared_data_ref *ref2 = NULL;
1269         struct extent_buffer *leaf;
1270         u32 num_refs = 0;
1271         int ret = 0;
1272
1273         leaf = path->nodes[0];
1274         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1275
1276         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1277                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1278                                       struct btrfs_extent_data_ref);
1279                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1280         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1281                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1282                                       struct btrfs_shared_data_ref);
1283                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1284 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1285         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1286                 struct btrfs_extent_ref_v0 *ref0;
1287                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1288                                       struct btrfs_extent_ref_v0);
1289                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1290 #endif
1291         } else {
1292                 BUG();
1293         }
1294
1295         BUG_ON(num_refs < refs_to_drop);
1296         num_refs -= refs_to_drop;
1297
1298         if (num_refs == 0) {
1299                 ret = btrfs_del_item(trans, root, path);
1300                 *last_ref = 1;
1301         } else {
1302                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1303                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1304                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1305                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1306 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1307                 else {
1308                         struct btrfs_extent_ref_v0 *ref0;
1309                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1310                                         struct btrfs_extent_ref_v0);
1311                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1312                 }
1313 #endif
1314                 btrfs_mark_buffer_dirty(leaf);
1315         }
1316         return ret;
1317 }
1318
1319 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1320                                           struct btrfs_path *path,
1321                                           struct btrfs_extent_inline_ref *iref)
1322 {
1323         struct btrfs_key key;
1324         struct extent_buffer *leaf;
1325         struct btrfs_extent_data_ref *ref1;
1326         struct btrfs_shared_data_ref *ref2;
1327         u32 num_refs = 0;
1328
1329         leaf = path->nodes[0];
1330         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1331         if (iref) {
1332                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1333                     BTRFS_EXTENT_DATA_REF_KEY) {
1334                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1335                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1336                 } else {
1337                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1338                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1339                 }
1340         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1341                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1342                                       struct btrfs_extent_data_ref);
1343                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1344         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1345                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1346                                       struct btrfs_shared_data_ref);
1347                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1348 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1349         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1350                 struct btrfs_extent_ref_v0 *ref0;
1351                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1352                                       struct btrfs_extent_ref_v0);
1353                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1354 #endif
1355         } else {
1356                 WARN_ON(1);
1357         }
1358         return num_refs;
1359 }
1360
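/*
 * Look up the keyed backref item for a tree block: a SHARED_BLOCK_REF
 * keyed on @parent when the ref is shared, otherwise a TREE_BLOCK_REF
 * keyed on @root_objectid.  Returns 0 if the item is found, -ENOENT if
 * it is not, or a negative errno from the search.  On
 * BTRFS_COMPAT_EXTENT_TREE_V0 builds a shared ref miss is retried with
 * the old EXTENT_REF_V0 key.
 */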
1361 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1362                                           struct btrfs_root *root,
1363                                           struct btrfs_path *path,
1364                                           u64 bytenr, u64 parent,
1365                                           u64 root_objectid)
1366 {
1367         struct btrfs_key key;
1368         int ret;
1369
1370         key.objectid = bytenr;
1371         if (parent) {
1372                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1373                 key.offset = parent;
1374         } else {
1375                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1376                 key.offset = root_objectid;
1377         }
1378
1379         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1380         if (ret > 0)
1381                 ret = -ENOENT;
1382 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1383         if (ret == -ENOENT && parent) {
1384                 btrfs_release_path(path);
1385                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1386                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1387                 if (ret > 0)
1388                         ret = -ENOENT;
1389         }
1390 #endif
1391         return ret;
1392 }
1393
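/*
 * Insert the keyed backref item for a tree block: SHARED_BLOCK_REF keyed
 * on @parent for shared refs, TREE_BLOCK_REF keyed on @root_objectid
 * otherwise.  The item carries no payload; the key alone records the ref.
 */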
1394 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1395                                           struct btrfs_root *root,
1396                                           struct btrfs_path *path,
1397                                           u64 bytenr, u64 parent,
1398                                           u64 root_objectid)
1399 {
1400         struct btrfs_key key;
1401         int ret;
1402
1403         key.objectid = bytenr;
1404         if (parent) {
1405                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1406                 key.offset = parent;
1407         } else {
1408                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1409                 key.offset = root_objectid;
1410         }
1411
1412         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1413         btrfs_release_path(path);
1414         return ret;
1415 }
1416
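/*
 * Map (parent, owner) to the backref key type: tree blocks (owner below
 * BTRFS_FIRST_FREE_OBJECTID) use SHARED_BLOCK_REF/TREE_BLOCK_REF, data
 * extents use SHARED_DATA_REF/EXTENT_DATA_REF, with the shared variants
 * chosen whenever a parent block is given.
 */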
1417 static inline int extent_ref_type(u64 parent, u64 owner)
1418 {
1419         int type;
1420         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1421                 if (parent > 0)
1422                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1423                 else
1424                         type = BTRFS_TREE_BLOCK_REF_KEY;
1425         } else {
1426                 if (parent > 0)
1427                         type = BTRFS_SHARED_DATA_REF_KEY;
1428                 else
1429                         type = BTRFS_EXTENT_DATA_REF_KEY;
1430         }
1431         return type;
1432 }
1433
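/*
 * Starting at @level and walking up towards the root, find the key that
 * follows the current slot in the path.  Returns 0 and fills @key on
 * success, 1 if the path already points at the last key.
 */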
1434 static int find_next_key(struct btrfs_path *path, int level,
1435                          struct btrfs_key *key)
1436
1437 {
1438         for (; level < BTRFS_MAX_LEVEL; level++) {
1439                 if (!path->nodes[level])
1440                         break;
1441                 if (path->slots[level] + 1 >=
1442                     btrfs_header_nritems(path->nodes[level]))
1443                         continue;
1444                 if (level == 0)
1445                         btrfs_item_key_to_cpu(path->nodes[level], key,
1446                                               path->slots[level] + 1);
1447                 else
1448                         btrfs_node_key_to_cpu(path->nodes[level], key,
1449                                               path->slots[level] + 1);
1450                 return 0;
1451         }
1452         return 1;
1453 }
1454
1455 /*
1456  * look for inline back ref. if back ref is found, *ref_ret is set
1457  * to the address of inline back ref, and 0 is returned.
1458  *
1459  * if back ref isn't found, *ref_ret is set to the address where it
1460  * should be inserted, and -ENOENT is returned.
1461  *
1462  * if insert is true and there are too many inline back refs, the path
1463  * points to the extent item, and -EAGAIN is returned.
1464  *
1465  * NOTE: inline back refs are ordered in the same way that back ref
1466  *       items in the tree are ordered.
1467  */
1468 static noinline_for_stack
1469 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1470                                  struct btrfs_root *root,
1471                                  struct btrfs_path *path,
1472                                  struct btrfs_extent_inline_ref **ref_ret,
1473                                  u64 bytenr, u64 num_bytes,
1474                                  u64 parent, u64 root_objectid,
1475                                  u64 owner, u64 offset, int insert)
1476 {
1477         struct btrfs_key key;
1478         struct extent_buffer *leaf;
1479         struct btrfs_extent_item *ei;
1480         struct btrfs_extent_inline_ref *iref;
1481         u64 flags;
1482         u64 item_size;
1483         unsigned long ptr;
1484         unsigned long end;
1485         int extra_size;
1486         int type;
1487         int want;
1488         int ret;
1489         int err = 0;
1490         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1491                                                  SKINNY_METADATA);
1492
1493         key.objectid = bytenr;
1494         key.type = BTRFS_EXTENT_ITEM_KEY;
1495         key.offset = num_bytes;
1496
1497         want = extent_ref_type(parent, owner);
1498         if (insert) {
1499                 extra_size = btrfs_extent_inline_ref_size(want);
1500                 path->keep_locks = 1;
1501         } else
1502                 extra_size = -1;
1503
1504         /*
1505          * Owner is our parent level, so we can just add one to get the level
1506          * for the block we are interested in.
1507          */
1508         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1509                 key.type = BTRFS_METADATA_ITEM_KEY;
1510                 key.offset = owner;
1511         }
1512
1513 again:
1514         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1515         if (ret < 0) {
1516                 err = ret;
1517                 goto out;
1518         }
1519
1520         /*
1521          * We may be a newly converted file system which still has the old fat
1522          * extent entries for metadata, so try and see if we have one of those.
1523          */
1524         if (ret > 0 && skinny_metadata) {
1525                 skinny_metadata = false;
1526                 if (path->slots[0]) {
1527                         path->slots[0]--;
1528                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1529                                               path->slots[0]);
1530                         if (key.objectid == bytenr &&
1531                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1532                             key.offset == num_bytes)
1533                                 ret = 0;
1534                 }
1535                 if (ret) {
1536                         key.objectid = bytenr;
1537                         key.type = BTRFS_EXTENT_ITEM_KEY;
1538                         key.offset = num_bytes;
1539                         btrfs_release_path(path);
1540                         goto again;
1541                 }
1542         }
1543
1544         if (ret && !insert) {
1545                 err = -ENOENT;
1546                 goto out;
1547         } else if (WARN_ON(ret)) {
1548                 err = -EIO;
1549                 goto out;
1550         }
1551
1552         leaf = path->nodes[0];
1553         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1554 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1555         if (item_size < sizeof(*ei)) {
1556                 if (!insert) {
1557                         err = -ENOENT;
1558                         goto out;
1559                 }
1560                 ret = convert_extent_item_v0(trans, root, path, owner,
1561                                              extra_size);
1562                 if (ret < 0) {
1563                         err = ret;
1564                         goto out;
1565                 }
1566                 leaf = path->nodes[0];
1567                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1568         }
1569 #endif
1570         BUG_ON(item_size < sizeof(*ei));
1571
1572         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1573         flags = btrfs_extent_flags(leaf, ei);
1574
1575         ptr = (unsigned long)(ei + 1);
1576         end = (unsigned long)ei + item_size;
1577
1578         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1579                 ptr += sizeof(struct btrfs_tree_block_info);
1580                 BUG_ON(ptr > end);
1581         }
1582
1583         err = -ENOENT;
1584         while (1) {
1585                 if (ptr >= end) {
1586                         WARN_ON(ptr > end);
1587                         break;
1588                 }
1589                 iref = (struct btrfs_extent_inline_ref *)ptr;
1590                 type = btrfs_extent_inline_ref_type(leaf, iref);
1591                 if (want < type)
1592                         break;
1593                 if (want > type) {
1594                         ptr += btrfs_extent_inline_ref_size(type);
1595                         continue;
1596                 }
1597
1598                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1599                         struct btrfs_extent_data_ref *dref;
1600                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1601                         if (match_extent_data_ref(leaf, dref, root_objectid,
1602                                                   owner, offset)) {
1603                                 err = 0;
1604                                 break;
1605                         }
1606                         if (hash_extent_data_ref_item(leaf, dref) <
1607                             hash_extent_data_ref(root_objectid, owner, offset))
1608                                 break;
1609                 } else {
1610                         u64 ref_offset;
1611                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1612                         if (parent > 0) {
1613                                 if (parent == ref_offset) {
1614                                         err = 0;
1615                                         break;
1616                                 }
1617                                 if (ref_offset < parent)
1618                                         break;
1619                         } else {
1620                                 if (root_objectid == ref_offset) {
1621                                         err = 0;
1622                                         break;
1623                                 }
1624                                 if (ref_offset < root_objectid)
1625                                         break;
1626                         }
1627                 }
1628                 ptr += btrfs_extent_inline_ref_size(type);
1629         }
1630         if (err == -ENOENT && insert) {
1631                 if (item_size + extra_size >=
1632                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1633                         err = -EAGAIN;
1634                         goto out;
1635                 }
1636                 /*
1637                  * To add new inline back ref, we have to make sure
1638                  * there is no corresponding back ref item.
1639                  * For simplicity, we just do not add new inline back
1640                  * ref if there is any kind of item for this block
1641                  */
1642                 if (find_next_key(path, 0, &key) == 0 &&
1643                     key.objectid == bytenr &&
1644                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1645                         err = -EAGAIN;
1646                         goto out;
1647                 }
1648         }
1649         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1650 out:
1651         if (insert) {
1652                 path->keep_locks = 0;
1653                 btrfs_unlock_up_safe(path, 1);
1654         }
1655         return err;
1656 }
1657
1658 /*
1659  * helper to add new inline back ref
1660  */
1661 static noinline_for_stack
1662 void setup_inline_extent_backref(struct btrfs_root *root,
1663                                  struct btrfs_path *path,
1664                                  struct btrfs_extent_inline_ref *iref,
1665                                  u64 parent, u64 root_objectid,
1666                                  u64 owner, u64 offset, int refs_to_add,
1667                                  struct btrfs_delayed_extent_op *extent_op)
1668 {
1669         struct extent_buffer *leaf;
1670         struct btrfs_extent_item *ei;
1671         unsigned long ptr;
1672         unsigned long end;
1673         unsigned long item_offset;
1674         u64 refs;
1675         int size;
1676         int type;
1677
1678         leaf = path->nodes[0];
1679         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1680         item_offset = (unsigned long)iref - (unsigned long)ei;
1681
1682         type = extent_ref_type(parent, owner);
1683         size = btrfs_extent_inline_ref_size(type);
1684
1685         btrfs_extend_item(root, path, size);
1686
1687         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1688         refs = btrfs_extent_refs(leaf, ei);
1689         refs += refs_to_add;
1690         btrfs_set_extent_refs(leaf, ei, refs);
1691         if (extent_op)
1692                 __run_delayed_extent_op(extent_op, leaf, ei);
1693
1694         ptr = (unsigned long)ei + item_offset;
1695         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1696         if (ptr < end - size)
1697                 memmove_extent_buffer(leaf, ptr + size, ptr,
1698                                       end - size - ptr);
1699
1700         iref = (struct btrfs_extent_inline_ref *)ptr;
1701         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1702         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1703                 struct btrfs_extent_data_ref *dref;
1704                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1705                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1706                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1707                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1708                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1709         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1710                 struct btrfs_shared_data_ref *sref;
1711                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1712                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1713                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1714         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1715                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1716         } else {
1717                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1718         }
1719         btrfs_mark_buffer_dirty(leaf);
1720 }
1721
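/*
 * Find any backref for the extent: first try an inline ref inside the
 * extent item, and if that is missing fall back to a separate keyed
 * backref item (tree block ref for metadata, data ref for file extents).
 */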
1722 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1723                                  struct btrfs_root *root,
1724                                  struct btrfs_path *path,
1725                                  struct btrfs_extent_inline_ref **ref_ret,
1726                                  u64 bytenr, u64 num_bytes, u64 parent,
1727                                  u64 root_objectid, u64 owner, u64 offset)
1728 {
1729         int ret;
1730
1731         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1732                                            bytenr, num_bytes, parent,
1733                                            root_objectid, owner, offset, 0);
1734         if (ret != -ENOENT)
1735                 return ret;
1736
1737         btrfs_release_path(path);
1738         *ref_ret = NULL;
1739
1740         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1741                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1742                                             root_objectid);
1743         } else {
1744                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1745                                              root_objectid, owner, offset);
1746         }
1747         return ret;
1748 }
1749
1750 /*
1751  * helper to update/remove inline back ref
1752  */
1753 static noinline_for_stack
1754 void update_inline_extent_backref(struct btrfs_root *root,
1755                                   struct btrfs_path *path,
1756                                   struct btrfs_extent_inline_ref *iref,
1757                                   int refs_to_mod,
1758                                   struct btrfs_delayed_extent_op *extent_op,
1759                                   int *last_ref)
1760 {
1761         struct extent_buffer *leaf;
1762         struct btrfs_extent_item *ei;
1763         struct btrfs_extent_data_ref *dref = NULL;
1764         struct btrfs_shared_data_ref *sref = NULL;
1765         unsigned long ptr;
1766         unsigned long end;
1767         u32 item_size;
1768         int size;
1769         int type;
1770         u64 refs;
1771
1772         leaf = path->nodes[0];
1773         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1774         refs = btrfs_extent_refs(leaf, ei);
1775         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1776         refs += refs_to_mod;
1777         btrfs_set_extent_refs(leaf, ei, refs);
1778         if (extent_op)
1779                 __run_delayed_extent_op(extent_op, leaf, ei);
1780
1781         type = btrfs_extent_inline_ref_type(leaf, iref);
1782
1783         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1784                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1785                 refs = btrfs_extent_data_ref_count(leaf, dref);
1786         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1787                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1788                 refs = btrfs_shared_data_ref_count(leaf, sref);
1789         } else {
1790                 refs = 1;
1791                 BUG_ON(refs_to_mod != -1);
1792         }
1793
1794         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1795         refs += refs_to_mod;
1796
1797         if (refs > 0) {
1798                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1799                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1800                 else
1801                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1802         } else {
1803                 *last_ref = 1;
1804                 size =  btrfs_extent_inline_ref_size(type);
1805                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1806                 ptr = (unsigned long)iref;
1807                 end = (unsigned long)ei + item_size;
1808                 if (ptr + size < end)
1809                         memmove_extent_buffer(leaf, ptr, ptr + size,
1810                                               end - ptr - size);
1811                 item_size -= size;
1812                 btrfs_truncate_item(root, path, item_size, 1);
1813         }
1814         btrfs_mark_buffer_dirty(leaf);
1815 }
1816
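/*
 * Add @refs_to_add references for the extent, preferring the inline
 * format.  If a matching inline ref already exists it is updated in
 * place; on -ENOENT a new inline ref is set up at the position the
 * lookup returned; -EAGAIN (no room left in the extent item) is passed
 * back so the caller can add a keyed backref item instead.
 */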
1817 static noinline_for_stack
1818 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1819                                  struct btrfs_root *root,
1820                                  struct btrfs_path *path,
1821                                  u64 bytenr, u64 num_bytes, u64 parent,
1822                                  u64 root_objectid, u64 owner,
1823                                  u64 offset, int refs_to_add,
1824                                  struct btrfs_delayed_extent_op *extent_op)
1825 {
1826         struct btrfs_extent_inline_ref *iref;
1827         int ret;
1828
1829         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1830                                            bytenr, num_bytes, parent,
1831                                            root_objectid, owner, offset, 1);
1832         if (ret == 0) {
1833                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1834                 update_inline_extent_backref(root, path, iref,
1835                                              refs_to_add, extent_op, NULL);
1836         } else if (ret == -ENOENT) {
1837                 setup_inline_extent_backref(root, path, iref, parent,
1838                                             root_objectid, owner, offset,
1839                                             refs_to_add, extent_op);
1840                 ret = 0;
1841         }
1842         return ret;
1843 }
1844
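/*
 * Insert a non-inline backref item: a keyed tree block ref for metadata
 * (always a single ref) or a data ref item for file extents.
 */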
1845 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1846                                  struct btrfs_root *root,
1847                                  struct btrfs_path *path,
1848                                  u64 bytenr, u64 parent, u64 root_objectid,
1849                                  u64 owner, u64 offset, int refs_to_add)
1850 {
1851         int ret;
1852         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1853                 BUG_ON(refs_to_add != 1);
1854                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1855                                             parent, root_objectid);
1856         } else {
1857                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1858                                              parent, root_objectid,
1859                                              owner, offset, refs_to_add);
1860         }
1861         return ret;
1862 }
1863
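/*
 * Drop @refs_to_drop references from whichever backref form the caller
 * located: an inline ref, a separate data ref item, or a keyed tree
 * block ref item.  *last_ref is set once the last reference goes away.
 */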
1864 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1865                                  struct btrfs_root *root,
1866                                  struct btrfs_path *path,
1867                                  struct btrfs_extent_inline_ref *iref,
1868                                  int refs_to_drop, int is_data, int *last_ref)
1869 {
1870         int ret = 0;
1871
1872         BUG_ON(!is_data && refs_to_drop != 1);
1873         if (iref) {
1874                 update_inline_extent_backref(root, path, iref,
1875                                              -refs_to_drop, NULL, last_ref);
1876         } else if (is_data) {
1877                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1878                                              last_ref);
1879         } else {
1880                 *last_ref = 1;
1881                 ret = btrfs_del_item(trans, root, path);
1882         }
1883         return ret;
1884 }
1885
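/*
 * Pass a byte range down to the block layer as a discard.  The >> 9
 * shifts convert bytes to the 512-byte sectors that
 * blkdev_issue_discard() expects; e.g. a 1MiB range becomes 2048
 * sectors.  On success the whole length is reported back through
 * *discarded_bytes.
 */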
1886 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1887                                u64 *discarded_bytes)
1888 {
1889         int ret = 0;
1890
1891         *discarded_bytes = 0;
1892         ret = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1893         if (!ret)
1894                 *discarded_bytes = len;
1895
1896         return ret;
1897 }
1898
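/*
 * Discard a logical extent: map it to the physical stripes it covers and
 * issue a discard on each device that supports it, summing how much was
 * actually discarded into *actual_bytes.  -EOPNOTSUPP from a device is
 * ignored so callers don't fail on hardware without discard support.
 */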
1899 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1900                          u64 num_bytes, u64 *actual_bytes)
1901 {
1902         int ret;
1903         u64 discarded_bytes = 0;
1904         struct btrfs_bio *bbio = NULL;
1905
1906
1907         /* Tell the block device(s) that the sectors can be discarded */
1908         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1909                               bytenr, &num_bytes, &bbio, 0);
1910         /* Error condition is -ENOMEM */
1911         if (!ret) {
1912                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1913                 int i;
1914
1915
1916                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1917                         u64 bytes;
1918                         if (!stripe->dev->can_discard)
1919                                 continue;
1920
1921                         ret = btrfs_issue_discard(stripe->dev->bdev,
1922                                                   stripe->physical,
1923                                                   stripe->length,
1924                                                   &bytes);
1925                         if (!ret)
1926                                 discarded_bytes += bytes;
1927                         else if (ret != -EOPNOTSUPP)
1928                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1929
1930                         /*
1931                          * Just in case we get back EOPNOTSUPP for some reason,
1932                          * just ignore the return value so we don't screw up
1933                          * people calling discard_extent.
1934                          */
1935                         ret = 0;
1936                 }
1937                 btrfs_put_bbio(bbio);
1938         }
1939
1940         if (actual_bytes)
1941                 *actual_bytes = discarded_bytes;
1942
1943
1944         if (ret == -EOPNOTSUPP)
1945                 ret = 0;
1946         return ret;
1947 }
1948
1949 /* Can return -ENOMEM */
1950 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1951                          struct btrfs_root *root,
1952                          u64 bytenr, u64 num_bytes, u64 parent,
1953                          u64 root_objectid, u64 owner, u64 offset,
1954                          int no_quota)
1955 {
1956         int ret;
1957         struct btrfs_fs_info *fs_info = root->fs_info;
1958
1959         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1960                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1961
1962         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1963                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1964                                         num_bytes,
1965                                         parent, root_objectid, (int)owner,
1966                                         BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1967         } else {
1968                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1969                                         num_bytes,
1970                                         parent, root_objectid, owner, offset,
1971                                         BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1972         }
1973         return ret;
1974 }
1975
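/*
 * Worker for the delayed ref code: add @refs_to_add references to an
 * existing extent item.  An inline backref is tried first; if the item
 * has no room left (-EAGAIN) the reference count is bumped on the extent
 * item and a separate keyed backref item is inserted instead.
 */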
1976 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1977                                   struct btrfs_root *root,
1978                                   struct btrfs_delayed_ref_node *node,
1979                                   u64 parent, u64 root_objectid,
1980                                   u64 owner, u64 offset, int refs_to_add,
1981                                   struct btrfs_delayed_extent_op *extent_op)
1982 {
1983         struct btrfs_fs_info *fs_info = root->fs_info;
1984         struct btrfs_path *path;
1985         struct extent_buffer *leaf;
1986         struct btrfs_extent_item *item;
1987         struct btrfs_key key;
1988         u64 bytenr = node->bytenr;
1989         u64 num_bytes = node->num_bytes;
1990         u64 refs;
1991         int ret;
1992         int no_quota = node->no_quota;
1993
1994         path = btrfs_alloc_path();
1995         if (!path)
1996                 return -ENOMEM;
1997
1998         if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
1999                 no_quota = 1;
2000
2001         path->reada = 1;
2002         path->leave_spinning = 1;
2003         /* this will setup the path even if it fails to insert the back ref */
2004         ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2005                                            bytenr, num_bytes, parent,
2006                                            root_objectid, owner, offset,
2007                                            refs_to_add, extent_op);
2008         if ((ret < 0 && ret != -EAGAIN) || !ret)
2009                 goto out;
2010
2011         /*
2012          * Ok, we had -EAGAIN, which means we didn't have space to insert an
2013          * inline extent ref, so just update the reference count and add a
2014          * normal backref.
2015          */
2016         leaf = path->nodes[0];
2017         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2018         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2019         refs = btrfs_extent_refs(leaf, item);
2020         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2021         if (extent_op)
2022                 __run_delayed_extent_op(extent_op, leaf, item);
2023
2024         btrfs_mark_buffer_dirty(leaf);
2025         btrfs_release_path(path);
2026
2027         path->reada = 1;
2028         path->leave_spinning = 1;
2029         /* now insert the actual backref */
2030         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2031                                     path, bytenr, parent, root_objectid,
2032                                     owner, offset, refs_to_add);
2033         if (ret)
2034                 btrfs_abort_transaction(trans, root, ret);
2035 out:
2036         btrfs_free_path(path);
2037         return ret;
2038 }
2039
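/*
 * Process one delayed ref for a data extent: allocate the reserved
 * extent and insert its first ref, add a ref to an existing extent, or
 * drop a ref, depending on the delayed ref action.
 */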
2040 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2041                                 struct btrfs_root *root,
2042                                 struct btrfs_delayed_ref_node *node,
2043                                 struct btrfs_delayed_extent_op *extent_op,
2044                                 int insert_reserved)
2045 {
2046         int ret = 0;
2047         struct btrfs_delayed_data_ref *ref;
2048         struct btrfs_key ins;
2049         u64 parent = 0;
2050         u64 ref_root = 0;
2051         u64 flags = 0;
2052
2053         ins.objectid = node->bytenr;
2054         ins.offset = node->num_bytes;
2055         ins.type = BTRFS_EXTENT_ITEM_KEY;
2056
2057         ref = btrfs_delayed_node_to_data_ref(node);
2058         trace_run_delayed_data_ref(node, ref, node->action);
2059
2060         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2061                 parent = ref->parent;
2062         ref_root = ref->root;
2063
2064         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2065                 if (extent_op)
2066                         flags |= extent_op->flags_to_set;
2067                 ret = alloc_reserved_file_extent(trans, root,
2068                                                  parent, ref_root, flags,
2069                                                  ref->objectid, ref->offset,
2070                                                  &ins, node->ref_mod);
2071         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2072                 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2073                                              ref_root, ref->objectid,
2074                                              ref->offset, node->ref_mod,
2075                                              extent_op);
2076         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2077                 ret = __btrfs_free_extent(trans, root, node, parent,
2078                                           ref_root, ref->objectid,
2079                                           ref->offset, node->ref_mod,
2080                                           extent_op);
2081         } else {
2082                 BUG();
2083         }
2084         return ret;
2085 }
2086
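/*
 * Apply a delayed extent op to an extent item that is already mapped
 * into a leaf: set additional extent flags and/or record the first key
 * of a tree block in its btrfs_tree_block_info.
 */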
2087 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2088                                     struct extent_buffer *leaf,
2089                                     struct btrfs_extent_item *ei)
2090 {
2091         u64 flags = btrfs_extent_flags(leaf, ei);
2092         if (extent_op->update_flags) {
2093                 flags |= extent_op->flags_to_set;
2094                 btrfs_set_extent_flags(leaf, ei, flags);
2095         }
2096
2097         if (extent_op->update_key) {
2098                 struct btrfs_tree_block_info *bi;
2099                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2100                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2101                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2102         }
2103 }
2104
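/*
 * Look up the extent item a delayed extent op refers to and apply the op
 * to it.  For metadata the skinny METADATA_ITEM key (offset = level) is
 * tried first when the feature is enabled; otherwise, or when only an
 * old-style EXTENT_ITEM exists, the full (bytenr, EXTENT_ITEM, size) key
 * is used.
 */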
2105 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2106                                  struct btrfs_root *root,
2107                                  struct btrfs_delayed_ref_node *node,
2108                                  struct btrfs_delayed_extent_op *extent_op)
2109 {
2110         struct btrfs_key key;
2111         struct btrfs_path *path;
2112         struct btrfs_extent_item *ei;
2113         struct extent_buffer *leaf;
2114         u32 item_size;
2115         int ret;
2116         int err = 0;
2117         int metadata = !extent_op->is_data;
2118
2119         if (trans->aborted)
2120                 return 0;
2121
2122         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2123                 metadata = 0;
2124
2125         path = btrfs_alloc_path();
2126         if (!path)
2127                 return -ENOMEM;
2128
2129         key.objectid = node->bytenr;
2130
2131         if (metadata) {
2132                 key.type = BTRFS_METADATA_ITEM_KEY;
2133                 key.offset = extent_op->level;
2134         } else {
2135                 key.type = BTRFS_EXTENT_ITEM_KEY;
2136                 key.offset = node->num_bytes;
2137         }
2138
2139 again:
2140         path->reada = 1;
2141         path->leave_spinning = 1;
2142         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2143                                 path, 0, 1);
2144         if (ret < 0) {
2145                 err = ret;
2146                 goto out;
2147         }
2148         if (ret > 0) {
2149                 if (metadata) {
2150                         if (path->slots[0] > 0) {
2151                                 path->slots[0]--;
2152                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2153                                                       path->slots[0]);
2154                                 if (key.objectid == node->bytenr &&
2155                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2156                                     key.offset == node->num_bytes)
2157                                         ret = 0;
2158                         }
2159                         if (ret > 0) {
2160                                 btrfs_release_path(path);
2161                                 metadata = 0;
2162
2163                                 key.objectid = node->bytenr;
2164                                 key.offset = node->num_bytes;
2165                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2166                                 goto again;
2167                         }
2168                 } else {
2169                         err = -EIO;
2170                         goto out;
2171                 }
2172         }
2173
2174         leaf = path->nodes[0];
2175         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2176 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2177         if (item_size < sizeof(*ei)) {
2178                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2179                                              path, (u64)-1, 0);
2180                 if (ret < 0) {
2181                         err = ret;
2182                         goto out;
2183                 }
2184                 leaf = path->nodes[0];
2185                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2186         }
2187 #endif
2188         BUG_ON(item_size < sizeof(*ei));
2189         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2190         __run_delayed_extent_op(extent_op, leaf, ei);
2191
2192         btrfs_mark_buffer_dirty(leaf);
2193 out:
2194         btrfs_free_path(path);
2195         return err;
2196 }
2197
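/*
 * Process one delayed ref for a tree block: allocate the reserved block
 * and insert its extent item, add a ref to an existing block, or drop a
 * ref.  With the skinny-metadata feature the insert key carries the
 * block level instead of its byte size.
 */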
2198 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2199                                 struct btrfs_root *root,
2200                                 struct btrfs_delayed_ref_node *node,
2201                                 struct btrfs_delayed_extent_op *extent_op,
2202                                 int insert_reserved)
2203 {
2204         int ret = 0;
2205         struct btrfs_delayed_tree_ref *ref;
2206         struct btrfs_key ins;
2207         u64 parent = 0;
2208         u64 ref_root = 0;
2209         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2210                                                  SKINNY_METADATA);
2211
2212         ref = btrfs_delayed_node_to_tree_ref(node);
2213         trace_run_delayed_tree_ref(node, ref, node->action);
2214
2215         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2216                 parent = ref->parent;
2217         ref_root = ref->root;
2218
2219         ins.objectid = node->bytenr;
2220         if (skinny_metadata) {
2221                 ins.offset = ref->level;
2222                 ins.type = BTRFS_METADATA_ITEM_KEY;
2223         } else {
2224                 ins.offset = node->num_bytes;
2225                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2226         }
2227
2228         BUG_ON(node->ref_mod != 1);
2229         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2230                 BUG_ON(!extent_op || !extent_op->update_flags);
2231                 ret = alloc_reserved_tree_block(trans, root,
2232                                                 parent, ref_root,
2233                                                 extent_op->flags_to_set,
2234                                                 &extent_op->key,
2235                                                 ref->level, &ins,
2236                                                 node->no_quota);
2237         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2238                 ret = __btrfs_inc_extent_ref(trans, root, node,
2239                                              parent, ref_root,
2240                                              ref->level, 0, 1,
2241                                              extent_op);
2242         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2243                 ret = __btrfs_free_extent(trans, root, node,
2244                                           parent, ref_root,
2245                                           ref->level, 0, 1, extent_op);
2246         } else {
2247                 BUG();
2248         }
2249         return ret;
2250 }
2251
2252 /* helper function to actually process a single delayed ref entry */
2253 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2254                                struct btrfs_root *root,
2255                                struct btrfs_delayed_ref_node *node,
2256                                struct btrfs_delayed_extent_op *extent_op,
2257                                int insert_reserved)
2258 {
2259         int ret = 0;
2260
2261         if (trans->aborted) {
2262                 if (insert_reserved)
2263                         btrfs_pin_extent(root, node->bytenr,
2264                                          node->num_bytes, 1);
2265                 return 0;
2266         }
2267
2268         if (btrfs_delayed_ref_is_head(node)) {
2269                 struct btrfs_delayed_ref_head *head;
2270                 /*
2271                  * we've hit the end of the chain and we were supposed
2272                  * to insert this extent into the tree.  But, it got
2273                  * deleted before we ever needed to insert it, so all
2274                  * we have to do is clean up the accounting
2275                  */
2276                 BUG_ON(extent_op);
2277                 head = btrfs_delayed_node_to_head(node);
2278                 trace_run_delayed_ref_head(node, head, node->action);
2279
2280                 if (insert_reserved) {
2281                         btrfs_pin_extent(root, node->bytenr,
2282                                          node->num_bytes, 1);
2283                         if (head->is_data) {
2284                                 ret = btrfs_del_csums(trans, root,
2285                                                       node->bytenr,
2286                                                       node->num_bytes);
2287                         }
2288                 }
2289                 return ret;
2290         }
2291
2292         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2293             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2294                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2295                                            insert_reserved);
2296         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2297                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2298                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2299                                            insert_reserved);
2300         else
2301                 BUG();
2302         return ret;
2303 }
2304
2305 static inline struct btrfs_delayed_ref_node *
2306 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2307 {
2308         struct btrfs_delayed_ref_node *ref;
2309
2310         if (list_empty(&head->ref_list))
2311                 return NULL;
2312
2313         /*
2314          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2315          * This is to prevent a ref count from going down to zero, which deletes
2316          * the extent item from the extent tree, when there still are references
2317          * to add, which would fail because they would not find the extent item.
2318          */
2319         list_for_each_entry(ref, &head->ref_list, list) {
2320                 if (ref->action == BTRFS_ADD_DELAYED_REF)
2321                         return ref;
2322         }
2323
2324         return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2325                           list);
2326 }
2327
2328 /*
2329  * Returns 0 on success or if called with an already aborted transaction.
2330  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2331  */
2332 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2333                                              struct btrfs_root *root,
2334                                              unsigned long nr)
2335 {
2336         struct btrfs_delayed_ref_root *delayed_refs;
2337         struct btrfs_delayed_ref_node *ref;
2338         struct btrfs_delayed_ref_head *locked_ref = NULL;
2339         struct btrfs_delayed_extent_op *extent_op;
2340         struct btrfs_fs_info *fs_info = root->fs_info;
2341         ktime_t start = ktime_get();
2342         int ret;
2343         unsigned long count = 0;
2344         unsigned long actual_count = 0;
2345         int must_insert_reserved = 0;
2346
2347         delayed_refs = &trans->transaction->delayed_refs;
2348         while (1) {
2349                 if (!locked_ref) {
2350                         if (count >= nr)
2351                                 break;
2352
2353                         spin_lock(&delayed_refs->lock);
2354                         locked_ref = btrfs_select_ref_head(trans);
2355                         if (!locked_ref) {
2356                                 spin_unlock(&delayed_refs->lock);
2357                                 break;
2358                         }
2359
2360                         /* grab the lock that says we are going to process
2361                          * all the refs for this head */
2362                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2363                         spin_unlock(&delayed_refs->lock);
2364                         /*
2365                          * we may have dropped the spin lock to get the head
2366                          * mutex lock, and that might have given someone else
2367                          * time to free the head.  If that's true, it has been
2368                          * removed from our list and we can move on.
2369                          */
2370                         if (ret == -EAGAIN) {
2371                                 locked_ref = NULL;
2372                                 count++;
2373                                 continue;
2374                         }
2375                 }
2376
2377                 spin_lock(&locked_ref->lock);
2378
2379                 /*
2380                  * locked_ref is the head node, so we have to go one
2381                  * node back for any delayed ref updates
2382                  */
2383                 ref = select_delayed_ref(locked_ref);
2384
2385                 if (ref && ref->seq &&
2386                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2387                         spin_unlock(&locked_ref->lock);
2388                         btrfs_delayed_ref_unlock(locked_ref);
2389                         spin_lock(&delayed_refs->lock);
2390                         locked_ref->processing = 0;
2391                         delayed_refs->num_heads_ready++;
2392                         spin_unlock(&delayed_refs->lock);
2393                         locked_ref = NULL;
2394                         cond_resched();
2395                         count++;
2396                         continue;
2397                 }
2398
2399                 /*
2400                  * record the must insert reserved flag before we
2401                  * drop the spin lock.
2402                  */
2403                 must_insert_reserved = locked_ref->must_insert_reserved;
2404                 locked_ref->must_insert_reserved = 0;
2405
2406                 extent_op = locked_ref->extent_op;
2407                 locked_ref->extent_op = NULL;
2408
2409                 if (!ref) {
2410
2411
2412                         /* All delayed refs have been processed, go ahead
2413                          * and send the head node to run_one_delayed_ref,
2414                          * so that any accounting fixes can happen
2415                          */
2416                         ref = &locked_ref->node;
2417
2418                         if (extent_op && must_insert_reserved) {
2419                                 btrfs_free_delayed_extent_op(extent_op);
2420                                 extent_op = NULL;
2421                         }
2422
2423                         if (extent_op) {
2424                                 spin_unlock(&locked_ref->lock);
2425                                 ret = run_delayed_extent_op(trans, root,
2426                                                             ref, extent_op);
2427                                 btrfs_free_delayed_extent_op(extent_op);
2428
2429                                 if (ret) {
2430                                         /*
2431                                          * Need to reset must_insert_reserved if
2432                                          * there was an error so the abort stuff
2433                                          * can cleanup the reserved space
2434                                          * properly.
2435                                          */
2436                                         if (must_insert_reserved)
2437                                                 locked_ref->must_insert_reserved = 1;
2438                                         locked_ref->processing = 0;
2439                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2440                                         btrfs_delayed_ref_unlock(locked_ref);
2441                                         return ret;
2442                                 }
2443                                 continue;
2444                         }
2445
2446                         /*
2447                          * Need to drop our head ref lock and re-acquire the
2448                          * delayed ref lock and then re-check to make sure
2449                          * nobody got added.
2450                          */
2451                         spin_unlock(&locked_ref->lock);
2452                         spin_lock(&delayed_refs->lock);
2453                         spin_lock(&locked_ref->lock);
2454                         if (!list_empty(&locked_ref->ref_list) ||
2455                             locked_ref->extent_op) {
2456                                 spin_unlock(&locked_ref->lock);
2457                                 spin_unlock(&delayed_refs->lock);
2458                                 continue;
2459                         }
2460                         ref->in_tree = 0;
2461                         delayed_refs->num_heads--;
2462                         rb_erase(&locked_ref->href_node,
2463                                  &delayed_refs->href_root);
2464                         spin_unlock(&delayed_refs->lock);
2465                 } else {
2466                         actual_count++;
2467                         ref->in_tree = 0;
2468                         list_del(&ref->list);
2469                 }
2470                 atomic_dec(&delayed_refs->num_entries);
2471
2472                 if (!btrfs_delayed_ref_is_head(ref)) {
2473                         /*
2474                          * when we play the delayed ref, also correct the
2475                          * ref_mod on head
2476                          */
2477                         switch (ref->action) {
2478                         case BTRFS_ADD_DELAYED_REF:
2479                         case BTRFS_ADD_DELAYED_EXTENT:
2480                                 locked_ref->node.ref_mod -= ref->ref_mod;
2481                                 break;
2482                         case BTRFS_DROP_DELAYED_REF:
2483                                 locked_ref->node.ref_mod += ref->ref_mod;
2484                                 break;
2485                         default:
2486                                 WARN_ON(1);
2487                         }
2488                 }
2489                 spin_unlock(&locked_ref->lock);
2490
2491                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2492                                           must_insert_reserved);
2493
2494                 btrfs_free_delayed_extent_op(extent_op);
2495                 if (ret) {
2496                         locked_ref->processing = 0;
2497                         btrfs_delayed_ref_unlock(locked_ref);
2498                         btrfs_put_delayed_ref(ref);
2499                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2500                         return ret;
2501                 }
2502
2503                 /*
2504                  * If this node is a head, that means all the refs in this head
2505                  * have been dealt with, and we will pick the next head to deal
2506                  * with, so we must unlock the head and drop it from the cluster
2507                  * list before we release it.
2508                  */
2509                 if (btrfs_delayed_ref_is_head(ref)) {
2510                         if (locked_ref->is_data &&
2511                             locked_ref->total_ref_mod < 0) {
2512                                 spin_lock(&delayed_refs->lock);
2513                                 delayed_refs->pending_csums -= ref->num_bytes;
2514                                 spin_unlock(&delayed_refs->lock);
2515                         }
2516                         btrfs_delayed_ref_unlock(locked_ref);
2517                         locked_ref = NULL;
2518                 }
2519                 btrfs_put_delayed_ref(ref);
2520                 count++;
2521                 cond_resched();
2522         }
2523
2524         /*
2525          * We don't want to include ref heads since we can have empty ref heads
2526          * and those will drastically skew our runtime down since we just do
2527          * accounting, no actual extent tree updates.
2528          */
2529         if (actual_count > 0) {
2530                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2531                 u64 avg;
2532
2533                 /*
2534                  * We weigh the current average higher than our current runtime
2535                  * to avoid large swings in the average.
2536                  */
2537                 spin_lock(&delayed_refs->lock);
2538                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2539                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2540                 spin_unlock(&delayed_refs->lock);
2541         }
2542         return 0;
2543 }
2544
2545 #ifdef SCRAMBLE_DELAYED_REFS
2546 /*
2547  * Normally delayed refs get processed in ascending bytenr order. This
2548  * correlates in most cases to the order added. To expose dependencies on this
2549  * order, we start to process the tree in the middle instead of the beginning
2550  */
2551 static u64 find_middle(struct rb_root *root)
2552 {
2553         struct rb_node *n = root->rb_node;
2554         struct btrfs_delayed_ref_node *entry;
2555         int alt = 1;
2556         u64 middle;
2557         u64 first = 0, last = 0;
2558
2559         n = rb_first(root);
2560         if (n) {
2561                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2562                 first = entry->bytenr;
2563         }
2564         n = rb_last(root);
2565         if (n) {
2566                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2567                 last = entry->bytenr;
2568         }
2569         n = root->rb_node;
2570
2571         while (n) {
2572                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2573                 WARN_ON(!entry->in_tree);
2574
2575                 middle = entry->bytenr;
2576
2577                 if (alt)
2578                         n = n->rb_left;
2579                 else
2580                         n = n->rb_right;
2581
2582                 alt = 1 - alt;
2583         }
2584         return middle;
2585 }
2586 #endif
2587
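/*
 * Rough worst-case estimate of how many extent tree leaves @heads
 * delayed ref heads may touch: each head is costed as one extent item
 * plus one inline ref (plus a tree_block_info without skinny metadata),
 * and the total is divided by the leaf data size.
 */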
2588 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2589 {
2590         u64 num_bytes;
2591
2592         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2593                              sizeof(struct btrfs_extent_inline_ref));
2594         if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2595                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2596
2597         /*
2598          * We don't ever fill up leaves all the way so multiply by 2 just to be
2599          * closer to what we're really going to want to use.
2600          */
2601         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2602 }
2603
2604 /*
2605  * Takes the number of bytes to be checksummed and figures out how many leaves it
2606  * would require to store the csums for that many bytes.
2607  */
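/*
 * Worked example (illustrative numbers only, assuming 4KiB sectors,
 * 4-byte crc32c checksums and roughly 16KiB of usable leaf space):
 * 1GiB of data needs 1GiB / 4KiB = 262144 csums, a leaf holds a bit
 * under 16KiB / 4 = 4096 of them, so roughly 65 leaves are required.
 */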
2608 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2609 {
2610         u64 csum_size;
2611         u64 num_csums_per_leaf;
2612         u64 num_csums;
2613
2614         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
2615         num_csums_per_leaf = div64_u64(csum_size,
2616                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2617         num_csums = div64_u64(csum_bytes, root->sectorsize);
2618         num_csums += num_csums_per_leaf - 1;
2619         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2620         return num_csums;
2621 }
2622
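/*
 * Estimate the metadata that running the currently queued delayed refs
 * could consume (extent tree leaves, csum tree leaves and dirty block
 * group updates) and return 1 if the global reserve might not cover it,
 * 0 otherwise.
 */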
2623 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2624                                        struct btrfs_root *root)
2625 {
2626         struct btrfs_block_rsv *global_rsv;
2627         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2628         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2629         u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2630         u64 num_bytes, num_dirty_bgs_bytes;
2631         int ret = 0;
2632
2633         num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2634         num_heads = heads_to_leaves(root, num_heads);
2635         if (num_heads > 1)
2636                 num_bytes += (num_heads - 1) * root->nodesize;
2637         num_bytes <<= 1;
2638         num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2639         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2640                                                              num_dirty_bgs);
2641         global_rsv = &root->fs_info->global_block_rsv;
2642
2643         /*
2644          * If we can't allocate any more chunks lets make sure we have _lots_ of
2645          * wiggle room since running delayed refs can create more delayed refs.
2646          */
2647         if (global_rsv->space_info->full) {
2648                 num_dirty_bgs_bytes <<= 1;
2649                 num_bytes <<= 1;
2650         }
2651
2652         spin_lock(&global_rsv->lock);
2653         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2654                 ret = 1;
2655         spin_unlock(&global_rsv->lock);
2656         return ret;
2657 }
2658
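/*
 * Estimate how long running the queued delayed refs would take based on the
 * average ref runtime: 1 means more than a second's worth is queued, 2 means
 * more than half a second's worth, otherwise fall back to the global reserve
 * check above.
 */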
2659 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2660                                        struct btrfs_root *root)
2661 {
2662         struct btrfs_fs_info *fs_info = root->fs_info;
2663         u64 num_entries =
2664                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2665         u64 avg_runtime;
2666         u64 val;
2667
2668         smp_mb();
2669         avg_runtime = fs_info->avg_delayed_ref_runtime;
2670         val = num_entries * avg_runtime;
2671         if (num_entries * avg_runtime >= NSEC_PER_SEC)
2672                 return 1;
2673         if (val >= NSEC_PER_SEC / 2)
2674                 return 2;
2675
2676         return btrfs_check_space_for_delayed_refs(trans, root);
2677 }
2678
2679 struct async_delayed_refs {
2680         struct btrfs_root *root;
2681         int count;
2682         int error;
2683         int sync;
2684         struct completion wait;
2685         struct btrfs_work work;
2686 };
2687
2688 static void delayed_ref_async_start(struct btrfs_work *work)
2689 {
2690         struct async_delayed_refs *async;
2691         struct btrfs_trans_handle *trans;
2692         int ret;
2693
2694         async = container_of(work, struct async_delayed_refs, work);
2695
2696         trans = btrfs_join_transaction(async->root);
2697         if (IS_ERR(trans)) {
2698                 async->error = PTR_ERR(trans);
2699                 goto done;
2700         }
2701
2702         /*
2703          * trans->sync means that when we call end_transaction, we won't
2704          * wait on delayed refs
2705          */
2706         trans->sync = true;
2707         ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2708         if (ret)
2709                 async->error = ret;
2710
2711         ret = btrfs_end_transaction(trans, async->root);
2712         if (ret && !async->error)
2713                 async->error = ret;
2714 done:
2715         if (async->sync)
2716                 complete(&async->wait);
2717         else
2718                 kfree(async);
2719 }
2720
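/*
 * Queue a worker that joins a transaction and runs up to @count delayed refs.
 * If @wait is set, block until the worker finishes and return its error,
 * otherwise return right after queueing the work.
 */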
2721 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2722                                  unsigned long count, int wait)
2723 {
2724         struct async_delayed_refs *async;
2725         int ret;
2726
2727         async = kmalloc(sizeof(*async), GFP_NOFS);
2728         if (!async)
2729                 return -ENOMEM;
2730
2731         async->root = root->fs_info->tree_root;
2732         async->count = count;
2733         async->error = 0;
2734         if (wait)
2735                 async->sync = 1;
2736         else
2737                 async->sync = 0;
2738         init_completion(&async->wait);
2739
2740         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2741                         delayed_ref_async_start, NULL, NULL);
2742
2743         btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2744
2745         if (wait) {
2746                 wait_for_completion(&async->wait);
2747                 ret = async->error;
2748                 kfree(async);
2749                 return ret;
2750         }
2751         return 0;
2752 }
2753
2754 /*
2755  * this starts processing the delayed reference count updates and
2756  * extent insertions we have queued up so far.  count can be
2757  * 0, which means to process everything in the tree at the start
2758  * of the run (but not newly added entries), or it can be some target
2759  * number you'd like to process.
2760  *
2761  * Returns 0 on success or if called with an aborted transaction
2762  * Returns <0 on error and aborts the transaction
2763  */
2764 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2765                            struct btrfs_root *root, unsigned long count)
2766 {
2767         struct rb_node *node;
2768         struct btrfs_delayed_ref_root *delayed_refs;
2769         struct btrfs_delayed_ref_head *head;
2770         int ret;
2771         int run_all = count == (unsigned long)-1;
2772
2773         /* We'll clean this up in btrfs_cleanup_transaction */
2774         if (trans->aborted)
2775                 return 0;
2776
2777         if (root == root->fs_info->extent_root)
2778                 root = root->fs_info->tree_root;
2779
2780         delayed_refs = &trans->transaction->delayed_refs;
2781         if (count == 0)
2782                 count = atomic_read(&delayed_refs->num_entries) * 2;
2783
2784 again:
2785 #ifdef SCRAMBLE_DELAYED_REFS
2786         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2787 #endif
2788         ret = __btrfs_run_delayed_refs(trans, root, count);
2789         if (ret < 0) {
2790                 btrfs_abort_transaction(trans, root, ret);
2791                 return ret;
2792         }
2793
2794         if (run_all) {
2795                 if (!list_empty(&trans->new_bgs))
2796                         btrfs_create_pending_block_groups(trans, root);
2797
2798                 spin_lock(&delayed_refs->lock);
2799                 node = rb_first(&delayed_refs->href_root);
2800                 if (!node) {
2801                         spin_unlock(&delayed_refs->lock);
2802                         goto out;
2803                 }
2804                 count = (unsigned long)-1;
2805
2806                 while (node) {
2807                         head = rb_entry(node, struct btrfs_delayed_ref_head,
2808                                         href_node);
2809                         if (btrfs_delayed_ref_is_head(&head->node)) {
2810                                 struct btrfs_delayed_ref_node *ref;
2811
2812                                 ref = &head->node;
2813                                 atomic_inc(&ref->refs);
2814
2815                                 spin_unlock(&delayed_refs->lock);
2816                                 /*
2817                                  * Mutex was contended, block until it's
2818                                  * released and try again
2819                                  */
2820                                 mutex_lock(&head->mutex);
2821                                 mutex_unlock(&head->mutex);
2822
2823                                 btrfs_put_delayed_ref(ref);
2824                                 cond_resched();
2825                                 goto again;
2826                         } else {
2827                                 WARN_ON(1);
2828                         }
2829                         node = rb_next(node);
2830                 }
2831                 spin_unlock(&delayed_refs->lock);
2832                 cond_resched();
2833                 goto again;
2834         }
2835 out:
2836         assert_qgroups_uptodate(trans);
2837         return 0;
2838 }
2839
2840 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2841                                 struct btrfs_root *root,
2842                                 u64 bytenr, u64 num_bytes, u64 flags,
2843                                 int level, int is_data)
2844 {
2845         struct btrfs_delayed_extent_op *extent_op;
2846         int ret;
2847
2848         extent_op = btrfs_alloc_delayed_extent_op();
2849         if (!extent_op)
2850                 return -ENOMEM;
2851
2852         extent_op->flags_to_set = flags;
2853         extent_op->update_flags = 1;
2854         extent_op->update_key = 0;
2855         extent_op->is_data = is_data ? 1 : 0;
2856         extent_op->level = level;
2857
2858         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2859                                           num_bytes, extent_op);
2860         if (ret)
2861                 btrfs_free_delayed_extent_op(extent_op);
2862         return ret;
2863 }
2864
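/*
 * Look through the delayed refs queued for @bytenr and report whether any of
 * them implies a reference from somewhere other than the given root, inode
 * objectid and file offset.  Returns 1 if a cross reference may exist, 0 if
 * none was found, and -EAGAIN if the ref head mutex was contended and the
 * caller should retry.
 */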
2865 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2866                                       struct btrfs_root *root,
2867                                       struct btrfs_path *path,
2868                                       u64 objectid, u64 offset, u64 bytenr)
2869 {
2870         struct btrfs_delayed_ref_head *head;
2871         struct btrfs_delayed_ref_node *ref;
2872         struct btrfs_delayed_data_ref *data_ref;
2873         struct btrfs_delayed_ref_root *delayed_refs;
2874         int ret = 0;
2875
2876         delayed_refs = &trans->transaction->delayed_refs;
2877         spin_lock(&delayed_refs->lock);
2878         head = btrfs_find_delayed_ref_head(trans, bytenr);
2879         if (!head) {
2880                 spin_unlock(&delayed_refs->lock);
2881                 return 0;
2882         }
2883
2884         if (!mutex_trylock(&head->mutex)) {
2885                 atomic_inc(&head->node.refs);
2886                 spin_unlock(&delayed_refs->lock);
2887
2888                 btrfs_release_path(path);
2889
2890                 /*
2891                  * Mutex was contended, block until it's released and let
2892                  * caller try again
2893                  */
2894                 mutex_lock(&head->mutex);
2895                 mutex_unlock(&head->mutex);
2896                 btrfs_put_delayed_ref(&head->node);
2897                 return -EAGAIN;
2898         }
2899         spin_unlock(&delayed_refs->lock);
2900
2901         spin_lock(&head->lock);
2902         list_for_each_entry(ref, &head->ref_list, list) {
2903                 /* If it's a shared ref we know a cross reference exists */
2904                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2905                         ret = 1;
2906                         break;
2907                 }
2908
2909                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2910
2911                 /*
2912                  * If our ref doesn't match the one we're currently looking at
2913                  * then we have a cross reference.
2914                  */
2915                 if (data_ref->root != root->root_key.objectid ||
2916                     data_ref->objectid != objectid ||
2917                     data_ref->offset != offset) {
2918                         ret = 1;
2919                         break;
2920                 }
2921         }
2922         spin_unlock(&head->lock);
2923         mutex_unlock(&head->mutex);
2924         return ret;
2925 }
2926
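/*
 * Check the committed extent tree for @bytenr: return 0 only when the extent
 * item holds a single inline data ref owned by this root/objectid/offset and
 * the extent is newer than the root's last snapshot.  Returns 1 when a cross
 * reference may exist and -ENOENT when no matching extent item was found.
 */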
2927 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2928                                         struct btrfs_root *root,
2929                                         struct btrfs_path *path,
2930                                         u64 objectid, u64 offset, u64 bytenr)
2931 {
2932         struct btrfs_root *extent_root = root->fs_info->extent_root;
2933         struct extent_buffer *leaf;
2934         struct btrfs_extent_data_ref *ref;
2935         struct btrfs_extent_inline_ref *iref;
2936         struct btrfs_extent_item *ei;
2937         struct btrfs_key key;
2938         u32 item_size;
2939         int ret;
2940
2941         key.objectid = bytenr;
2942         key.offset = (u64)-1;
2943         key.type = BTRFS_EXTENT_ITEM_KEY;
2944
2945         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2946         if (ret < 0)
2947                 goto out;
2948         BUG_ON(ret == 0); /* Corruption */
2949
2950         ret = -ENOENT;
2951         if (path->slots[0] == 0)
2952                 goto out;
2953
2954         path->slots[0]--;
2955         leaf = path->nodes[0];
2956         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2957
2958         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2959                 goto out;
2960
2961         ret = 1;
2962         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2963 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2964         if (item_size < sizeof(*ei)) {
2965                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2966                 goto out;
2967         }
2968 #endif
2969         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2970
2971         if (item_size != sizeof(*ei) +
2972             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2973                 goto out;
2974
2975         if (btrfs_extent_generation(leaf, ei) <=
2976             btrfs_root_last_snapshot(&root->root_item))
2977                 goto out;
2978
2979         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2980         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2981             BTRFS_EXTENT_DATA_REF_KEY)
2982                 goto out;
2983
2984         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2985         if (btrfs_extent_refs(leaf, ei) !=
2986             btrfs_extent_data_ref_count(leaf, ref) ||
2987             btrfs_extent_data_ref_root(leaf, ref) !=
2988             root->root_key.objectid ||
2989             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2990             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2991                 goto out;
2992
2993         ret = 0;
2994 out:
2995         return ret;
2996 }
2997
2998 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2999                           struct btrfs_root *root,
3000                           u64 objectid, u64 offset, u64 bytenr)
3001 {
3002         struct btrfs_path *path;
3003         int ret;
3004         int ret2;
3005
3006         path = btrfs_alloc_path();
3007         if (!path)
3008                 return -ENOENT;
3009
3010         do {
3011                 ret = check_committed_ref(trans, root, path, objectid,
3012                                           offset, bytenr);
3013                 if (ret && ret != -ENOENT)
3014                         goto out;
3015
3016                 ret2 = check_delayed_ref(trans, root, path, objectid,
3017                                          offset, bytenr);
3018         } while (ret2 == -EAGAIN);
3019
3020         if (ret2 && ret2 != -ENOENT) {
3021                 ret = ret2;
3022                 goto out;
3023         }
3024
3025         if (ret != -ENOENT || ret2 != -ENOENT)
3026                 ret = 0;
3027 out:
3028         btrfs_free_path(path);
3029         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3030                 WARN_ON(ret > 0);
3031         return ret;
3032 }
3033
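/*
 * Walk every item in @buf and either add or drop one reference (depending on
 * @inc) for each file extent a leaf points to or each child block a node
 * points to, using the buffer's header owner as the ref root and the buffer
 * itself as parent when @full_backref is set.
 */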
3034 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3035                            struct btrfs_root *root,
3036                            struct extent_buffer *buf,
3037                            int full_backref, int inc)
3038 {
3039         u64 bytenr;
3040         u64 num_bytes;
3041         u64 parent;
3042         u64 ref_root;
3043         u32 nritems;
3044         struct btrfs_key key;
3045         struct btrfs_file_extent_item *fi;
3046         int i;
3047         int level;
3048         int ret = 0;
3049         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3050                             u64, u64, u64, u64, u64, u64, int);
3051
3052
3053         if (btrfs_test_is_dummy_root(root))
3054                 return 0;
3055
3056         ref_root = btrfs_header_owner(buf);
3057         nritems = btrfs_header_nritems(buf);
3058         level = btrfs_header_level(buf);
3059
3060         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3061                 return 0;
3062
3063         if (inc)
3064                 process_func = btrfs_inc_extent_ref;
3065         else
3066                 process_func = btrfs_free_extent;
3067
3068         if (full_backref)
3069                 parent = buf->start;
3070         else
3071                 parent = 0;
3072
3073         for (i = 0; i < nritems; i++) {
3074                 if (level == 0) {
3075                         btrfs_item_key_to_cpu(buf, &key, i);
3076                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3077                                 continue;
3078                         fi = btrfs_item_ptr(buf, i,
3079                                             struct btrfs_file_extent_item);
3080                         if (btrfs_file_extent_type(buf, fi) ==
3081                             BTRFS_FILE_EXTENT_INLINE)
3082                                 continue;
3083                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3084                         if (bytenr == 0)
3085                                 continue;
3086
3087                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3088                         key.offset -= btrfs_file_extent_offset(buf, fi);
3089                         ret = process_func(trans, root, bytenr, num_bytes,
3090                                            parent, ref_root, key.objectid,
3091                                            key.offset, 1);
3092                         if (ret)
3093                                 goto fail;
3094                 } else {
3095                         bytenr = btrfs_node_blockptr(buf, i);
3096                         num_bytes = root->nodesize;
3097                         ret = process_func(trans, root, bytenr, num_bytes,
3098                                            parent, ref_root, level - 1, 0,
3099                                            1);
3100                         if (ret)
3101                                 goto fail;
3102                 }
3103         }
3104         return 0;
3105 fail:
3106         return ret;
3107 }
3108
3109 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3110                   struct extent_buffer *buf, int full_backref)
3111 {
3112         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3113 }
3114
3115 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3116                   struct extent_buffer *buf, int full_backref)
3117 {
3118         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3119 }
3120
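/*
 * Find the block group item for @cache in the extent tree and copy the
 * in-memory item over it.  Returns -ENOENT if the item does not exist yet.
 */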
3121 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3122                                  struct btrfs_root *root,
3123                                  struct btrfs_path *path,
3124                                  struct btrfs_block_group_cache *cache)
3125 {
3126         int ret;
3127         struct btrfs_root *extent_root = root->fs_info->extent_root;
3128         unsigned long bi;
3129         struct extent_buffer *leaf;
3130
3131         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3132         if (ret) {
3133                 if (ret > 0)
3134                         ret = -ENOENT;
3135                 goto fail;
3136         }
3137
3138         leaf = path->nodes[0];
3139         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3140         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3141         btrfs_mark_buffer_dirty(leaf);
3142 fail:
3143         btrfs_release_path(path);
3144         return ret;
3145
3146 }
3147
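/*
 * Return the block group that follows @cache in the block group cache rbtree,
 * taking a reference on it and dropping the one held on @cache, or NULL if
 * @cache was the last one.  If @cache has been removed from the tree in the
 * meantime, fall back to a lookup by the next bytenr.
 */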
3148 static struct btrfs_block_group_cache *
3149 next_block_group(struct btrfs_root *root,
3150                  struct btrfs_block_group_cache *cache)
3151 {
3152         struct rb_node *node;
3153
3154         spin_lock(&root->fs_info->block_group_cache_lock);
3155
3156         /* If our block group was removed, we need a full search. */
3157         if (RB_EMPTY_NODE(&cache->cache_node)) {
3158                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3159
3160                 spin_unlock(&root->fs_info->block_group_cache_lock);
3161                 btrfs_put_block_group(cache);
3162                 cache = btrfs_lookup_first_block_group(root->fs_info,
3163                                                        next_bytenr);
3164                 return cache;
3165         }
3166         node = rb_next(&cache->cache_node);
3167         btrfs_put_block_group(cache);
3168         if (node) {
3169                 cache = rb_entry(node, struct btrfs_block_group_cache,
3170                                  cache_node);
3171                 btrfs_get_block_group(cache);
3172         } else
3173                 cache = NULL;
3174         spin_unlock(&root->fs_info->block_group_cache_lock);
3175         return cache;
3176 }
3177
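/*
 * Prepare the free space cache inode of @block_group for writeout: create or
 * truncate the inode, zero its generation and preallocate room for the cache
 * based on the block group size.  On success the group ends up in
 * BTRFS_DC_SETUP, otherwise in BTRFS_DC_WRITTEN or BTRFS_DC_ERROR.
 */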
3178 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3179                             struct btrfs_trans_handle *trans,
3180                             struct btrfs_path *path)
3181 {
3182         struct btrfs_root *root = block_group->fs_info->tree_root;
3183         struct inode *inode = NULL;
3184         u64 alloc_hint = 0;
3185         int dcs = BTRFS_DC_ERROR;
3186         u64 num_pages = 0;
3187         int retries = 0;
3188         int ret = 0;
3189
3190         /*
3191          * If this block group is smaller than 100 megs don't bother caching the
3192          * block group.
3193          */
3194         if (block_group->key.offset < (100 * 1024 * 1024)) {
3195                 spin_lock(&block_group->lock);
3196                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3197                 spin_unlock(&block_group->lock);
3198                 return 0;
3199         }
3200
3201         if (trans->aborted)
3202                 return 0;
3203 again:
3204         inode = lookup_free_space_inode(root, block_group, path);
3205         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3206                 ret = PTR_ERR(inode);
3207                 btrfs_release_path(path);
3208                 goto out;
3209         }
3210
3211         if (IS_ERR(inode)) {
3212                 BUG_ON(retries);
3213                 retries++;
3214
3215                 if (block_group->ro)
3216                         goto out_free;
3217
3218                 ret = create_free_space_inode(root, trans, block_group, path);
3219                 if (ret)
3220                         goto out_free;
3221                 goto again;
3222         }
3223
3224         /* We've already set up this transaction, go ahead and exit */
3225         if (block_group->cache_generation == trans->transid &&
3226             i_size_read(inode)) {
3227                 dcs = BTRFS_DC_SETUP;
3228                 goto out_put;
3229         }
3230
3231         /*
3232          * We want to set the generation to 0, that way if anything goes wrong
3233          * from here on out we know not to trust this cache when we load up next
3234          * time.
3235          */
3236         BTRFS_I(inode)->generation = 0;
3237         ret = btrfs_update_inode(trans, root, inode);
3238         if (ret) {
3239                 /*
3240                  * So theoretically we could recover from this, simply set the
3241                  * super cache generation to 0 so we know to invalidate the
3242                  * cache, but then we'd have to keep track of the block groups
3243                  * that fail this way so we know we _have_ to reset this cache
3244                  * before the next commit or risk reading stale cache.  So to
3245                  * limit our exposure to horrible edge cases let's just abort the
3246                  * transaction, this only happens in really bad situations
3247                  * anyway.
3248                  */
3249                 btrfs_abort_transaction(trans, root, ret);
3250                 goto out_put;
3251         }
3252         WARN_ON(ret);
3253
3254         if (i_size_read(inode) > 0) {
3255                 ret = btrfs_check_trunc_cache_free_space(root,
3256                                         &root->fs_info->global_block_rsv);
3257                 if (ret)
3258                         goto out_put;
3259
3260                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3261                 if (ret)
3262                         goto out_put;
3263         }
3264
3265         spin_lock(&block_group->lock);
3266         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3267             !btrfs_test_opt(root, SPACE_CACHE)) {
3268                 /*
3269                  * don't bother trying to write stuff out _if_
3270                  * a) we're not cached,
3271                  * b) we're mounted with the nospace_cache option.
3272                  */
3273                 dcs = BTRFS_DC_WRITTEN;
3274                 spin_unlock(&block_group->lock);
3275                 goto out_put;
3276         }
3277         spin_unlock(&block_group->lock);
3278
3279         /*
3280          * Try to preallocate enough space based on how big the block group is.
3281          * Keep in mind this has to include any pinned space which could end up
3282          * taking up quite a bit since it's not folded into the other space
3283          * cache.
3284          */
3285         num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
3286         if (!num_pages)
3287                 num_pages = 1;
3288
3289         num_pages *= 16;
3290         num_pages *= PAGE_CACHE_SIZE;
3291
3292         ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
3293         if (ret)
3294                 goto out_put;
3295
3296         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3297                                               num_pages, num_pages,
3298                                               &alloc_hint);
3299         if (!ret)
3300                 dcs = BTRFS_DC_SETUP;
3301         btrfs_free_reserved_data_space(inode, num_pages);
3302
3303 out_put:
3304         iput(inode);
3305 out_free:
3306         btrfs_release_path(path);
3307 out:
3308         spin_lock(&block_group->lock);
3309         if (!ret && dcs == BTRFS_DC_SETUP)
3310                 block_group->cache_generation = trans->transid;
3311         block_group->disk_cache_state = dcs;
3312         spin_unlock(&block_group->lock);
3313
3314         return ret;
3315 }
3316
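/*
 * For every block group on the transaction's dirty list that still has a
 * clear cache state, set up its free space cache inode.  No-op unless the
 * space_cache mount option is enabled.
 */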
3317 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3318                             struct btrfs_root *root)
3319 {
3320         struct btrfs_block_group_cache *cache, *tmp;
3321         struct btrfs_transaction *cur_trans = trans->transaction;
3322         struct btrfs_path *path;
3323
3324         if (list_empty(&cur_trans->dirty_bgs) ||
3325             !btrfs_test_opt(root, SPACE_CACHE))
3326                 return 0;
3327
3328         path = btrfs_alloc_path();
3329         if (!path)
3330                 return -ENOMEM;
3331
3332         /* Could add new block groups, use _safe just in case */
3333         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3334                                  dirty_list) {
3335                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3336                         cache_save_setup(cache, trans, path);
3337         }
3338
3339         btrfs_free_path(path);
3340         return 0;
3341 }
3342
3343 /*
3344  * transaction commit does final block group cache writeback during a
3345  * critical section where nothing is allowed to change the FS.  This is
3346  * required in order for the cache to actually match the block group,
3347  * but can introduce a lot of latency into the commit.
3348  *
3349  * So, btrfs_start_dirty_block_groups is here to kick off block group
3350  * cache IO.  There's a chance we'll have to redo some of it if the
3351  * block group changes again during the commit, but it greatly reduces
3352  * the commit latency by getting rid of the easy block groups while
3353  * we're still allowing others to join the commit.
3354  */
3355 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3356                                    struct btrfs_root *root)
3357 {
3358         struct btrfs_block_group_cache *cache;
3359         struct btrfs_transaction *cur_trans = trans->transaction;
3360         int ret = 0;
3361         int should_put;
3362         struct btrfs_path *path = NULL;
3363         LIST_HEAD(dirty);
3364         struct list_head *io = &cur_trans->io_bgs;
3365         int num_started = 0;
3366         int loops = 0;
3367
3368         spin_lock(&cur_trans->dirty_bgs_lock);
3369         if (list_empty(&cur_trans->dirty_bgs)) {
3370                 spin_unlock(&cur_trans->dirty_bgs_lock);
3371                 return 0;
3372         }
3373         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3374         spin_unlock(&cur_trans->dirty_bgs_lock);
3375
3376 again:
3377         /*
3378          * make sure all the block groups on our dirty list actually
3379          * exist
3380          */
3381         btrfs_create_pending_block_groups(trans, root);
3382
3383         if (!path) {
3384                 path = btrfs_alloc_path();
3385                 if (!path)
3386                         return -ENOMEM;
3387         }
3388
3389         /*
3390          * cache_write_mutex is here only to save us from balance or automatic
3391          * removal of empty block groups deleting this block group while we are
3392          * writing out the cache
3393          */
3394         mutex_lock(&trans->transaction->cache_write_mutex);
3395         while (!list_empty(&dirty)) {
3396                 cache = list_first_entry(&dirty,
3397                                          struct btrfs_block_group_cache,
3398                                          dirty_list);
3399                 /*
3400                  * this can happen if something re-dirties a block
3401                  * group that is already under IO.  Just wait for it to
3402                  * finish and then do it all again
3403                  */
3404                 if (!list_empty(&cache->io_list)) {
3405                         list_del_init(&cache->io_list);
3406                         btrfs_wait_cache_io(root, trans, cache,
3407                                             &cache->io_ctl, path,
3408                                             cache->key.objectid);
3409                         btrfs_put_block_group(cache);
3410                 }
3411
3412
3413                 /*
3414                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3415                  * if it should update the cache_state.  Don't delete
3416                  * until after we wait.
3417                  *
3418                  * Since we're not running in the commit critical section
3419                  * we need the dirty_bgs_lock to protect from update_block_group
3420                  */
3421                 spin_lock(&cur_trans->dirty_bgs_lock);
3422                 list_del_init(&cache->dirty_list);
3423                 spin_unlock(&cur_trans->dirty_bgs_lock);
3424
3425                 should_put = 1;
3426
3427                 cache_save_setup(cache, trans, path);
3428
3429                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3430                         cache->io_ctl.inode = NULL;
3431                         ret = btrfs_write_out_cache(root, trans, cache, path);
3432                         if (ret == 0 && cache->io_ctl.inode) {
3433                                 num_started++;
3434                                 should_put = 0;
3435
3436                                 /*
3437                                  * the cache_write_mutex is protecting
3438                                  * the io_list
3439                                  */
3440                                 list_add_tail(&cache->io_list, io);
3441                         } else {
3442                                 /*
3443                                  * if we failed to write the cache, the
3444                                  * generation will be bad and life goes on
3445                                  */
3446                                 ret = 0;
3447                         }
3448                 }
3449                 if (!ret) {
3450                         ret = write_one_cache_group(trans, root, path, cache);
3451                         /*
3452                          * Our block group might still be attached to the list
3453                          * of new block groups in the transaction handle of some
3454                          * other task (struct btrfs_trans_handle->new_bgs). This
3455                          * means its block group item isn't yet in the extent
3456                          * tree. If this happens ignore the error, as we will
3457                          * try again later in the critical section of the
3458                          * transaction commit.
3459                          */
3460                         if (ret == -ENOENT) {
3461                                 ret = 0;
3462                                 spin_lock(&cur_trans->dirty_bgs_lock);
3463                                 if (list_empty(&cache->dirty_list)) {
3464                                         list_add_tail(&cache->dirty_list,
3465                                                       &cur_trans->dirty_bgs);
3466                                         btrfs_get_block_group(cache);
3467                                 }
3468                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3469                         } else if (ret) {
3470                                 btrfs_abort_transaction(trans, root, ret);
3471                         }
3472                 }
3473
3474                 /* if it's not on the io list, we need to put the block group */
3475                 if (should_put)
3476                         btrfs_put_block_group(cache);
3477
3478                 if (ret)
3479                         break;
3480
3481                 /*
3482                  * Avoid blocking other tasks for too long. It might even save
3483                  * us from writing caches for block groups that are going to be
3484                  * removed.
3485                  */
3486                 mutex_unlock(&trans->transaction->cache_write_mutex);
3487                 mutex_lock(&trans->transaction->cache_write_mutex);
3488         }
3489         mutex_unlock(&trans->transaction->cache_write_mutex);
3490
3491         /*
3492          * go through delayed refs for all the stuff we've just kicked off
3493          * and then loop back (just once)
3494          */
3495         ret = btrfs_run_delayed_refs(trans, root, 0);
3496         if (!ret && loops == 0) {
3497                 loops++;
3498                 spin_lock(&cur_trans->dirty_bgs_lock);
3499                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3500                 /*
3501                  * dirty_bgs_lock protects us from concurrent block group
3502                  * deletes too (not just cache_write_mutex).
3503                  */
3504                 if (!list_empty(&dirty)) {
3505                         spin_unlock(&cur_trans->dirty_bgs_lock);
3506                         goto again;
3507                 }
3508                 spin_unlock(&cur_trans->dirty_bgs_lock);
3509         }
3510
3511         btrfs_free_path(path);
3512         return ret;
3513 }
3514
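/*
 * Commit-time counterpart of btrfs_start_dirty_block_groups(): runs inside
 * the transaction critical section, writes out the remaining dirty block
 * group caches and block group items, and waits for all outstanding cache IO
 * before returning.
 */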
3515 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3516                                    struct btrfs_root *root)
3517 {
3518         struct btrfs_block_group_cache *cache;
3519         struct btrfs_transaction *cur_trans = trans->transaction;
3520         int ret = 0;
3521         int should_put;
3522         struct btrfs_path *path;
3523         struct list_head *io = &cur_trans->io_bgs;
3524         int num_started = 0;
3525
3526         path = btrfs_alloc_path();
3527         if (!path)
3528                 return -ENOMEM;
3529
3530         /*
3531          * We don't need the lock here since we are protected by the transaction
3532          * commit.  We want to do the cache_save_setup first and then run the
3533          * delayed refs to make sure we have the best chance at doing this all
3534          * in one shot.
3535          */
3536         while (!list_empty(&cur_trans->dirty_bgs)) {
3537                 cache = list_first_entry(&cur_trans->dirty_bgs,
3538                                          struct btrfs_block_group_cache,
3539                                          dirty_list);
3540
3541                 /*
3542                  * this can happen if cache_save_setup re-dirties a block
3543                  * group that is already under IO.  Just wait for it to
3544                  * finish and then do it all again
3545                  */
3546                 if (!list_empty(&cache->io_list)) {
3547                         list_del_init(&cache->io_list);
3548                         btrfs_wait_cache_io(root, trans, cache,
3549                                             &cache->io_ctl, path,
3550                                             cache->key.objectid);
3551                         btrfs_put_block_group(cache);
3552                 }
3553
3554                 /*
3555                  * don't remove from the dirty list until after we've waited
3556                  * on any pending IO
3557                  */
3558                 list_del_init(&cache->dirty_list);
3559                 should_put = 1;
3560
3561                 cache_save_setup(cache, trans, path);
3562
3563                 if (!ret)
3564                         ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3565
3566                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3567                         cache->io_ctl.inode = NULL;
3568                         ret = btrfs_write_out_cache(root, trans, cache, path);
3569                         if (ret == 0 && cache->io_ctl.inode) {
3570                                 num_started++;
3571                                 should_put = 0;
3572                                 list_add_tail(&cache->io_list, io);
3573                         } else {
3574                                 /*
3575                                  * if we failed to write the cache, the
3576                                  * generation will be bad and life goes on
3577                                  */
3578                                 ret = 0;
3579                         }
3580                 }
3581                 if (!ret) {
3582                         ret = write_one_cache_group(trans, root, path, cache);
3583                         if (ret)
3584                                 btrfs_abort_transaction(trans, root, ret);
3585                 }
3586
3587                 /* if it's not on the io list, we need to put the block group */
3588                 if (should_put)
3589                         btrfs_put_block_group(cache);
3590         }
3591
3592         while (!list_empty(io)) {
3593                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3594                                          io_list);
3595                 list_del_init(&cache->io_list);
3596                 btrfs_wait_cache_io(root, trans, cache,
3597                                     &cache->io_ctl, path, cache->key.objectid);
3598                 btrfs_put_block_group(cache);
3599         }
3600
3601         btrfs_free_path(path);
3602         return ret;
3603 }
3604
3605 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3606 {
3607         struct btrfs_block_group_cache *block_group;
3608         int readonly = 0;
3609
3610         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3611         if (!block_group || block_group->ro)
3612                 readonly = 1;
3613         if (block_group)
3614                 btrfs_put_block_group(block_group);
3615         return readonly;
3616 }
3617
3618 static const char *alloc_name(u64 flags)
3619 {
3620         switch (flags) {
3621         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3622                 return "mixed";
3623         case BTRFS_BLOCK_GROUP_METADATA:
3624                 return "metadata";
3625         case BTRFS_BLOCK_GROUP_DATA:
3626                 return "data";
3627         case BTRFS_BLOCK_GROUP_SYSTEM:
3628                 return "system";
3629         default:
3630                 WARN_ON(1);
3631                 return "invalid-combination";
3632         }
3633 }
3634
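/*
 * Find the space_info matching @flags and account @total_bytes/@bytes_used
 * into it, or allocate, initialize and register a new one (including its
 * sysfs kobject) if none exists yet.
 */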
3635 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3636                              u64 total_bytes, u64 bytes_used,
3637                              struct btrfs_space_info **space_info)
3638 {
3639         struct btrfs_space_info *found;
3640         int i;
3641         int factor;
3642         int ret;
3643
3644         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3645                      BTRFS_BLOCK_GROUP_RAID10))
3646                 factor = 2;
3647         else
3648                 factor = 1;
3649
3650         found = __find_space_info(info, flags);
3651         if (found) {
3652                 spin_lock(&found->lock);
3653                 found->total_bytes += total_bytes;
3654                 found->disk_total += total_bytes * factor;
3655                 found->bytes_used += bytes_used;
3656                 found->disk_used += bytes_used * factor;
3657                 if (total_bytes > 0)
3658                         found->full = 0;
3659                 spin_unlock(&found->lock);
3660                 *space_info = found;
3661                 return 0;
3662         }
3663         found = kzalloc(sizeof(*found), GFP_NOFS);
3664         if (!found)
3665                 return -ENOMEM;
3666
3667         ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3668         if (ret) {
3669                 kfree(found);
3670                 return ret;
3671         }
3672
3673         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3674                 INIT_LIST_HEAD(&found->block_groups[i]);
3675         init_rwsem(&found->groups_sem);
3676         spin_lock_init(&found->lock);
3677         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3678         found->total_bytes = total_bytes;
3679         found->disk_total = total_bytes * factor;
3680         found->bytes_used = bytes_used;
3681         found->disk_used = bytes_used * factor;
3682         found->bytes_pinned = 0;
3683         found->bytes_reserved = 0;
3684         found->bytes_readonly = 0;
3685         found->bytes_may_use = 0;
3686         if (total_bytes > 0)
3687                 found->full = 0;
3688         else
3689                 found->full = 1;
3690         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3691         found->chunk_alloc = 0;
3692         found->flush = 0;
3693         init_waitqueue_head(&found->wait);
3694         INIT_LIST_HEAD(&found->ro_bgs);
3695
3696         ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3697                                     info->space_info_kobj, "%s",
3698                                     alloc_name(found->flags));
3699         if (ret) {
3700                 kfree(found);
3701                 return ret;
3702         }
3703
3704         *space_info = found;
3705         list_add_rcu(&found->list, &info->space_info);
3706         if (flags & BTRFS_BLOCK_GROUP_DATA)
3707                 info->data_sinfo = found;
3708
3709         return ret;
3710 }
3711
3712 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3713 {
3714         u64 extra_flags = chunk_to_extended(flags) &
3715                                 BTRFS_EXTENDED_PROFILE_MASK;
3716
3717         write_seqlock(&fs_info->profiles_lock);
3718         if (flags & BTRFS_BLOCK_GROUP_DATA)
3719                 fs_info->avail_data_alloc_bits |= extra_flags;
3720         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3721                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3722         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3723                 fs_info->avail_system_alloc_bits |= extra_flags;
3724         write_sequnlock(&fs_info->profiles_lock);
3725 }
3726
3727 /*
3728  * returns target flags in extended format or 0 if restripe for this
3729  * chunk_type is not in progress
3730  *
3731  * should be called with either volume_mutex or balance_lock held
3732  */
3733 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3734 {
3735         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3736         u64 target = 0;
3737
3738         if (!bctl)
3739                 return 0;
3740
3741         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3742             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3743                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3744         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3745                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3746                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3747         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3748                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3749                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3750         }
3751
3752         return target;
3753 }
3754
3755 /*
3756  * @flags: available profiles in extended format (see ctree.h)
3757  *
3758  * Returns reduced profile in chunk format.  If profile changing is in
3759  * progress (either running or paused) picks the target profile (if it's
3760  * already available), otherwise falls back to plain reducing.
3761  */
3762 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3763 {
3764         u64 num_devices = root->fs_info->fs_devices->rw_devices;
3765         u64 target;
3766         u64 tmp;
3767
3768         /*
3769          * see if restripe for this chunk_type is in progress, if so
3770          * try to reduce to the target profile
3771          */
3772         spin_lock(&root->fs_info->balance_lock);
3773         target = get_restripe_target(root->fs_info, flags);
3774         if (target) {
3775                 /* pick target profile only if it's already available */
3776                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3777                         spin_unlock(&root->fs_info->balance_lock);
3778                         return extended_to_chunk(target);
3779                 }
3780         }
3781         spin_unlock(&root->fs_info->balance_lock);
3782
3783         /* First, mask out the RAID levels which aren't possible */
3784         if (num_devices == 1)
3785                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3786                            BTRFS_BLOCK_GROUP_RAID5);
3787         if (num_devices < 3)
3788                 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3789         if (num_devices < 4)
3790                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3791
3792         tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3793                        BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3794                        BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3795         flags &= ~tmp;
3796
3797         if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3798                 tmp = BTRFS_BLOCK_GROUP_RAID6;
3799         else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3800                 tmp = BTRFS_BLOCK_GROUP_RAID5;
3801         else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3802                 tmp = BTRFS_BLOCK_GROUP_RAID10;
3803         else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3804                 tmp = BTRFS_BLOCK_GROUP_RAID1;
3805         else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3806                 tmp = BTRFS_BLOCK_GROUP_RAID0;
3807
3808         return extended_to_chunk(flags | tmp);
3809 }
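/*
 * Example (illustrative): with two rw devices and both the RAID1 and RAID0
 * bits available, RAID6 and RAID10 are masked out (too few devices) and
 * RAID1 is preferred over RAID0, so the returned chunk-format profile is
 * RAID1.
 */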
3810
3811 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3812 {
3813         unsigned seq;
3814         u64 flags;
3815
3816         do {
3817                 flags = orig_flags;
3818                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3819
3820                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3821                         flags |= root->fs_info->avail_data_alloc_bits;
3822                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3823                         flags |= root->fs_info->avail_system_alloc_bits;
3824                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3825                         flags |= root->fs_info->avail_metadata_alloc_bits;
3826         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3827
3828         return btrfs_reduce_alloc_profile(root, flags);
3829 }
3830
3831 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3832 {
3833         u64 flags;
3834         u64 ret;
3835
3836         if (data)
3837                 flags = BTRFS_BLOCK_GROUP_DATA;
3838         else if (root == root->fs_info->chunk_root)
3839                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3840         else
3841                 flags = BTRFS_BLOCK_GROUP_METADATA;
3842
3843         ret = get_alloc_profile(root, flags);
3844         return ret;
3845 }
3846
3847 /*
3848  * This will check the space info that the inode allocates from to make sure
3849  * we have enough space for the requested number of bytes.
3850  */
3851 int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
3852 {
3853         struct btrfs_space_info *data_sinfo;
3854         struct btrfs_root *root = BTRFS_I(inode)->root;
3855         struct btrfs_fs_info *fs_info = root->fs_info;
3856         u64 used;
3857         int ret = 0;
3858         int need_commit = 2;
3859         int have_pinned_space;
3860
3861         /* make sure bytes are sectorsize aligned */
3862         bytes = ALIGN(bytes, root->sectorsize);
3863
3864         if (btrfs_is_free_space_inode(inode)) {
3865                 need_commit = 0;
3866                 ASSERT(current->journal_info);
3867         }
3868
3869         data_sinfo = fs_info->data_sinfo;
3870         if (!data_sinfo)
3871                 goto alloc;
3872
3873 again:
3874         /* make sure we have enough space to handle the data first */
3875         spin_lock(&data_sinfo->lock);
3876         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3877                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3878                 data_sinfo->bytes_may_use;
3879
3880         if (used + bytes > data_sinfo->total_bytes) {
3881                 struct btrfs_trans_handle *trans;
3882
3883                 /*
3884                  * if we don't have enough free bytes in this space then we need
3885                  * to alloc a new chunk.
3886                  */
3887                 if (!data_sinfo->full) {
3888                         u64 alloc_target;
3889
3890                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3891                         spin_unlock(&data_sinfo->lock);
3892 alloc:
3893                         alloc_target = btrfs_get_alloc_profile(root, 1);
3894                         /*
3895                          * It is ugly that we don't call nolock join
3896                          * transaction for the free space inode case here.
3897                          * But it is safe because we only do the data space
3898                          * reservation for the free space cache in the
3899                          * transaction context; the common join transaction
3900                          * just increases the counter of the current transaction
3901                          * handle and doesn't try to acquire the trans_lock of
3902                          * the fs.
3903                          */
3904                         trans = btrfs_join_transaction(root);
3905                         if (IS_ERR(trans))
3906                                 return PTR_ERR(trans);
3907
3908                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3909                                              alloc_target,
3910                                              CHUNK_ALLOC_NO_FORCE);
3911                         btrfs_end_transaction(trans, root);
3912                         if (ret < 0) {
3913                                 if (ret != -ENOSPC)
3914                                         return ret;
3915                                 else {
3916                                         have_pinned_space = 1;
3917                                         goto commit_trans;
3918                                 }
3919                         }
3920
3921                         if (!data_sinfo)
3922                                 data_sinfo = fs_info->data_sinfo;
3923
3924                         goto again;
3925                 }
3926
3927                 /*
3928                  * If we don't have enough pinned space to deal with this
3929                  * allocation, and no chunk was removed in the current transaction,
3930                  * don't bother committing the transaction.
3931                  */
3932                 have_pinned_space = percpu_counter_compare(
3933                         &data_sinfo->total_bytes_pinned,
3934                         used + bytes - data_sinfo->total_bytes);
3935                 spin_unlock(&data_sinfo->lock);
3936
3937                 /* commit the current transaction and try again */
3938 commit_trans:
3939                 if (need_commit &&
3940                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
3941                         need_commit--;
3942
3943                         if (need_commit > 0)
3944                                 btrfs_wait_ordered_roots(fs_info, -1);
3945
3946                         trans = btrfs_join_transaction(root);
3947                         if (IS_ERR(trans))
3948                                 return PTR_ERR(trans);
3949                         if (have_pinned_space >= 0 ||
3950                             trans->transaction->have_free_bgs ||
3951                             need_commit > 0) {
3952                                 ret = btrfs_commit_transaction(trans, root);
3953                                 if (ret)
3954                                         return ret;
3955                                 /*
3956                                  * make sure that all running delayed iputs are
3957                                  * done
3958                                  */
3959                                 down_write(&root->fs_info->delayed_iput_sem);
3960                                 up_write(&root->fs_info->delayed_iput_sem);
3961                                 goto again;
3962                         } else {
3963                                 btrfs_end_transaction(trans, root);
3964                         }
3965                 }
3966
3967                 trace_btrfs_space_reservation(root->fs_info,
3968                                               "space_info:enospc",
3969                                               data_sinfo->flags, bytes, 1);
3970                 return -ENOSPC;
3971         }
3972         ret = btrfs_qgroup_reserve(root, write_bytes);
3973         if (ret)
3974                 goto out;
3975         data_sinfo->bytes_may_use += bytes;
3976         trace_btrfs_space_reservation(root->fs_info, "space_info",
3977                                       data_sinfo->flags, bytes, 1);
3978 out:
3979         spin_unlock(&data_sinfo->lock);
3980
3981         return ret;
3982 }
3983
3984 /*
3985  * Called if we need to clear a data reservation for this inode.
3986  */
3987 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3988 {
3989         struct btrfs_root *root = BTRFS_I(inode)->root;
3990         struct btrfs_space_info *data_sinfo;
3991
3992         /* make sure bytes are sectorsize aligned */
3993         bytes = ALIGN(bytes, root->sectorsize);
3994
3995         data_sinfo = root->fs_info->data_sinfo;
3996         spin_lock(&data_sinfo->lock);
3997         WARN_ON(data_sinfo->bytes_may_use < bytes);
3998         data_sinfo->bytes_may_use -= bytes;
3999         trace_btrfs_space_reservation(root->fs_info, "space_info",
4000                                       data_sinfo->flags, bytes, 0);
4001         spin_unlock(&data_sinfo->lock);
4002 }
4003
4004 static void force_metadata_allocation(struct btrfs_fs_info *info)
4005 {
4006         struct list_head *head = &info->space_info;
4007         struct btrfs_space_info *found;
4008
4009         rcu_read_lock();
4010         list_for_each_entry_rcu(found, head, list) {
4011                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4012                         found->force_alloc = CHUNK_ALLOC_FORCE;
4013         }
4014         rcu_read_unlock();
4015 }
4016
4017 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4018 {
4019         return (global->size << 1);
4020 }
4021
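/*
 * Decide whether a new chunk should be allocated for @sinfo:
 *  - CHUNK_ALLOC_FORCE always allocates;
 *  - CHUNK_ALLOC_LIMITED allocates if the free space left in @sinfo drops
 *    below ~1% of the filesystem size (at least 64MB);
 *  - otherwise allocate once used + reserved bytes (plus the global reserve
 *    for metadata) reach roughly 80% of the space info's writable bytes.
 */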
4022 static int should_alloc_chunk(struct btrfs_root *root,
4023                               struct btrfs_space_info *sinfo, int force)
4024 {
4025         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4026         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4027         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4028         u64 thresh;
4029
4030         if (force == CHUNK_ALLOC_FORCE)
4031                 return 1;
4032
4033         /*
4034          * We need to take into account the global rsv because for all intents
4035          * and purposes it's used space.  Don't worry about locking the
4036          * global_rsv, it doesn't change except when the transaction commits.
4037          */
4038         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4039                 num_allocated += calc_global_rsv_need_space(global_rsv);
4040
4041         /*
4042          * in limited mode, we want to have some free space up to
4043          * about 1% of the FS size.
4044          */
4045         if (force == CHUNK_ALLOC_LIMITED) {
4046                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4047                 thresh = max_t(u64, 64 * 1024 * 1024,
4048                                div_factor_fine(thresh, 1));
4049
4050                 if (num_bytes - num_allocated < thresh)
4051                         return 1;
4052         }
4053
4054         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
4055                 return 0;
4056         return 1;
4057 }
4058
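/*
 * Number of devices whose device items will need updating when a chunk of
 * the given @type profile is allocated or removed: all rw devices for the
 * striped profiles, two for RAID1, one for DUP or single.
 */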
4059 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4060 {
4061         u64 num_dev;
4062
4063         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4064                     BTRFS_BLOCK_GROUP_RAID0 |
4065                     BTRFS_BLOCK_GROUP_RAID5 |
4066                     BTRFS_BLOCK_GROUP_RAID6))
4067                 num_dev = root->fs_info->fs_devices->rw_devices;
4068         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4069                 num_dev = 2;
4070         else
4071                 num_dev = 1;    /* DUP or single */
4072
4073         return num_dev;
4074 }
4075
4076 /*
4077  * Reserve space in the system space info necessary for allocating or
4078  * removing a chunk: @type determines how many device items need to be
4079  * updated, plus one chunk item to add or remove.
4080  */
4081 void check_system_chunk(struct btrfs_trans_handle *trans,
4082                         struct btrfs_root *root,
4083                         u64 type)
4084 {
4085         struct btrfs_space_info *info;
4086         u64 left;
4087         u64 thresh;
4088         int ret = 0;
4089         u64 num_devs;
4090
4091         /*
4092          * Needed because we can end up allocating a system chunk and need an
4093          * atomic and race-free space reservation in the chunk block reserve.
4094          */
4095         ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4096
4097         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4098         spin_lock(&info->lock);
4099         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4100                 info->bytes_reserved - info->bytes_readonly -
4101                 info->bytes_may_use;
4102         spin_unlock(&info->lock);
4103
4104         num_devs = get_profile_num_devs(root, type);
4105
4106         /* num_devs device items to update and 1 chunk item to add or remove */
4107         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4108                 btrfs_calc_trans_metadata_size(root, 1);
4109
4110         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
4111                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4112                         left, thresh, type);
4113                 dump_space_info(info, 0, 0);
4114         }
4115
4116         if (left < thresh) {
4117                 u64 flags;
4118
4119                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4120                 /*
4121                  * Ignore failure to create system chunk. We might end up not
4122                  * needing it, as we might not need to COW all nodes/leafs from
4123                  * the paths we visit in the chunk tree (they were already COWed
4124                  * or created in the current transaction for example).
4125                  */
4126                 ret = btrfs_alloc_chunk(trans, root, flags);
4127         }
4128
4129         if (!ret) {
4130                 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4131                                           &root->fs_info->chunk_block_rsv,
4132                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4133                 if (!ret)
4134                         trans->chunk_bytes_reserved += thresh;
4135         }
4136 }
4137
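/*
 * Allocate a new chunk for @flags if needed.  Returns 1 if a chunk was
 * allocated, 0 if no allocation was necessary, -ENOSPC if a chunk would have
 * been needed but the space info is already full (or we re-entered from
 * within an allocation), and other negative errnos on failure.  Concurrent
 * allocations for the same space info are serialized via
 * space_info->chunk_alloc and fs_info->chunk_mutex.
 */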
4138 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4139                           struct btrfs_root *extent_root, u64 flags, int force)
4140 {
4141         struct btrfs_space_info *space_info;
4142         struct btrfs_fs_info *fs_info = extent_root->fs_info;
4143         int wait_for_alloc = 0;
4144         int ret = 0;
4145
4146         /* Don't re-enter if we're already allocating a chunk */
4147         if (trans->allocating_chunk)
4148                 return -ENOSPC;
4149
4150         space_info = __find_space_info(extent_root->fs_info, flags);
4151         if (!space_info) {
4152                 ret = update_space_info(extent_root->fs_info, flags,
4153                                         0, 0, &space_info);
4154                 BUG_ON(ret); /* -ENOMEM */
4155         }
4156         BUG_ON(!space_info); /* Logic error */
4157
4158 again:
4159         spin_lock(&space_info->lock);
4160         if (force < space_info->force_alloc)
4161                 force = space_info->force_alloc;
4162         if (space_info->full) {
4163                 if (should_alloc_chunk(extent_root, space_info, force))
4164                         ret = -ENOSPC;
4165                 else
4166                         ret = 0;
4167                 spin_unlock(&space_info->lock);
4168                 return ret;
4169         }
4170
4171         if (!should_alloc_chunk(extent_root, space_info, force)) {
4172                 spin_unlock(&space_info->lock);
4173                 return 0;
4174         } else if (space_info->chunk_alloc) {
4175                 wait_for_alloc = 1;
4176         } else {
4177                 space_info->chunk_alloc = 1;
4178         }
4179
4180         spin_unlock(&space_info->lock);
4181
4182         mutex_lock(&fs_info->chunk_mutex);
4183
4184         /*
4185          * The chunk_mutex is held throughout the entirety of a chunk
4186          * allocation, so once we've acquired the chunk_mutex we know that the
4187          * other guy is done and we need to recheck and see if we should
4188          * allocate.
4189          */
4190         if (wait_for_alloc) {
4191                 mutex_unlock(&fs_info->chunk_mutex);
4192                 wait_for_alloc = 0;
4193                 goto again;
4194         }
4195
4196         trans->allocating_chunk = true;
4197
4198         /*
4199          * If we have mixed data/metadata chunks we want to make sure we keep
4200          * allocating mixed chunks instead of individual chunks.
4201          */
4202         if (btrfs_mixed_space_info(space_info))
4203                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4204
4205         /*
4206          * if we're doing a data chunk, go ahead and make sure that
4207          * we keep a reasonable number of metadata chunks allocated in the
4208          * FS as well.
4209          */
4210         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4211                 fs_info->data_chunk_allocations++;
4212                 if (!(fs_info->data_chunk_allocations %
4213                       fs_info->metadata_ratio))
4214                         force_metadata_allocation(fs_info);
4215         }
4216
4217         /*
4218          * Check if we have enough space in SYSTEM chunk because we may need
4219          * to update devices.
4220          */
4221         check_system_chunk(trans, extent_root, flags);
4222
4223         ret = btrfs_alloc_chunk(trans, extent_root, flags);
4224         trans->allocating_chunk = false;
4225
4226         spin_lock(&space_info->lock);
4227         if (ret < 0 && ret != -ENOSPC)
4228                 goto out;
4229         if (ret)
4230                 space_info->full = 1;
4231         else
4232                 ret = 1;
4233
4234         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4235 out:
4236         space_info->chunk_alloc = 0;
4237         spin_unlock(&space_info->lock);
4238         mutex_unlock(&fs_info->chunk_mutex);
4239         /*
4240          * When we allocate a new chunk we reserve space in the chunk block
4241          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4242          * add new nodes/leafs to it if we end up needing to do it when
4243          * inserting the chunk item and updating device items as part of the
4244          * second phase of chunk allocation, performed by
4245          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4246          * large number of new block groups to create in our transaction
4247          * handle's new_bgs list to avoid exhausting the chunk block reserve
4248          * in extreme cases - like having a single transaction create many new
4249          * block groups when starting to write out the free space caches of all
4250          * the block groups that were made dirty during the lifetime of the
4251          * transaction.
4252          */
4253         if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
4254                 btrfs_create_pending_block_groups(trans, trans->root);
4255                 btrfs_trans_release_chunk_metadata(trans);
4256         }
4257         return ret;
4258 }
4259
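/*
 * Decide whether a reservation of @bytes may overcommit @space_info.
 * Overcommit is refused once the global reserve no longer fits, and is
 * otherwise limited to a fraction of the unallocated device space: half of
 * it for the mirrored profiles (DUP/RAID1/RAID10), then 1/8 of what remains
 * when the caller may flush everything, 1/2 otherwise.
 */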
4260 static int can_overcommit(struct btrfs_root *root,
4261                           struct btrfs_space_info *space_info, u64 bytes,
4262                           enum btrfs_reserve_flush_enum flush)
4263 {
4264         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4265         u64 profile = btrfs_get_alloc_profile(root, 0);
4266         u64 space_size;
4267         u64 avail;
4268         u64 used;
4269
4270         used = space_info->bytes_used + space_info->bytes_reserved +
4271                 space_info->bytes_pinned + space_info->bytes_readonly;
4272
4273         /*
4274          * We only want to allow over committing if we have lots of actual space
4275          * free, but if we don't have enough space to handle the global reserve
4276          * space then we could end up having a real enospc problem when trying
4277          * to allocate a chunk or some other such important allocation.
4278          */
4279         spin_lock(&global_rsv->lock);
4280         space_size = calc_global_rsv_need_space(global_rsv);
4281         spin_unlock(&global_rsv->lock);
4282         if (used + space_size >= space_info->total_bytes)
4283                 return 0;
4284
4285         used += space_info->bytes_may_use;
4286
4287         spin_lock(&root->fs_info->free_chunk_lock);
4288         avail = root->fs_info->free_chunk_space;
4289         spin_unlock(&root->fs_info->free_chunk_lock);
4290
4291         /*
4292          * If we have dup, raid1 or raid10 then only half of the free
4293          * space is actually usable.  For raid56, the space info used
4294          * doesn't include the parity drive, so we don't have to
4295          * change the math.
4296          */
4297         if (profile & (BTRFS_BLOCK_GROUP_DUP |
4298                        BTRFS_BLOCK_GROUP_RAID1 |
4299                        BTRFS_BLOCK_GROUP_RAID10))
4300                 avail >>= 1;
4301
4302         /*
4303          * If we aren't flushing all things, let us overcommit up to
4304          * half of the space. If we can flush everything, don't let us
4305          * overcommit too much, only up to 1/8 of the space.
4306          */
4307         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4308                 avail >>= 3;
4309         else
4310                 avail >>= 1;
4311
4312         if (used + bytes < space_info->total_bytes + avail)
4313                 return 1;
4314         return 0;
4315 }
4316
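/*
 * Kick writeback of delalloc data so that reserved metadata can be
 * reclaimed.  Use the generic per-sb writeback path when s_umount can be
 * taken; otherwise fall back to btrfs' own delalloc flushing, waiting for
 * ordered extents only when we do not hold a transaction handle.
 */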
4317 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4318                                          unsigned long nr_pages, int nr_items)
4319 {
4320         struct super_block *sb = root->fs_info->sb;
4321
4322         if (down_read_trylock(&sb->s_umount)) {
4323                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4324                 up_read(&sb->s_umount);
4325         } else {
4326                 /*
4327                  * We needn't worry about the filesystem going from r/w to r/o
4328                  * even though we don't acquire the ->s_umount mutex, because the
4329                  * filesystem should guarantee that the delalloc inode list is
4330                  * empty after the filesystem becomes read-only (all dirty pages
4331                  * have been written to disk).
4332                  */
4333                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4334                 if (!current->journal_info)
4335                         btrfs_wait_ordered_roots(root->fs_info, nr_items);
4336         }
4337 }
4338
4339 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4340 {
4341         u64 bytes;
4342         int nr;
4343
4344         bytes = btrfs_calc_trans_metadata_size(root, 1);
4345         nr = (int)div64_u64(to_reclaim, bytes);
4346         if (!nr)
4347                 nr = 1;
4348         return nr;
4349 }
4350
4351 #define EXTENT_SIZE_PER_ITEM    (256 * 1024)
4352
4353 /*
4354  * shrink metadata reservation for delalloc
4355  */
4356 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4357                             bool wait_ordered)
4358 {
4359         struct btrfs_block_rsv *block_rsv;
4360         struct btrfs_space_info *space_info;
4361         struct btrfs_trans_handle *trans;
4362         u64 delalloc_bytes;
4363         u64 max_reclaim;
4364         long time_left;
4365         unsigned long nr_pages;
4366         int loops;
4367         int items;
4368         enum btrfs_reserve_flush_enum flush;
4369
4370         /* Calc the number of items we need to flush for this space reservation */
4371         items = calc_reclaim_items_nr(root, to_reclaim);
4372         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4373
4374         trans = (struct btrfs_trans_handle *)current->journal_info;
4375         block_rsv = &root->fs_info->delalloc_block_rsv;
4376         space_info = block_rsv->space_info;
4377
4378         delalloc_bytes = percpu_counter_sum_positive(
4379                                                 &root->fs_info->delalloc_bytes);
4380         if (delalloc_bytes == 0) {
4381                 if (trans)
4382                         return;
4383                 if (wait_ordered)
4384                         btrfs_wait_ordered_roots(root->fs_info, items);
4385                 return;
4386         }
4387
4388         loops = 0;
4389         while (delalloc_bytes && loops < 3) {
4390                 max_reclaim = min(delalloc_bytes, to_reclaim);
4391                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4392                 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4393                 /*
4394                  * We need to wait for the async pages to actually start before
4395                  * we do anything.
4396                  */
4397                 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4398                 if (!max_reclaim)
4399                         goto skip_async;
4400
4401                 if (max_reclaim <= nr_pages)
4402                         max_reclaim = 0;
4403                 else
4404                         max_reclaim -= nr_pages;
4405
4406                 wait_event(root->fs_info->async_submit_wait,
4407                            atomic_read(&root->fs_info->async_delalloc_pages) <=
4408                            (int)max_reclaim);
4409 skip_async:
4410                 if (!trans)
4411                         flush = BTRFS_RESERVE_FLUSH_ALL;
4412                 else
4413                         flush = BTRFS_RESERVE_NO_FLUSH;
4414                 spin_lock(&space_info->lock);
4415                 if (can_overcommit(root, space_info, orig, flush)) {
4416                         spin_unlock(&space_info->lock);
4417                         break;
4418                 }
4419                 spin_unlock(&space_info->lock);
4420
4421                 loops++;
4422                 if (wait_ordered && !trans) {
4423                         btrfs_wait_ordered_roots(root->fs_info, items);
4424                 } else {
4425                         time_left = schedule_timeout_killable(1);
4426                         if (time_left)
4427                                 break;
4428                 }
4429                 delalloc_bytes = percpu_counter_sum_positive(
4430                                                 &root->fs_info->delalloc_bytes);
4431         }
4432 }
4433
4434 /**
4435  * may_commit_transaction - possibly commit the transaction if it's OK to
4436  * @root - the root we're allocating for
4437  * @bytes - the number of bytes we want to reserve
4438  * @force - force the commit
4439  *
4440  * This will check to make sure that committing the transaction will actually
4441  * get us somewhere and then commit the transaction if it does.  Otherwise it
4442  * will return -ENOSPC.
4443  */
4444 static int may_commit_transaction(struct btrfs_root *root,
4445                                   struct btrfs_space_info *space_info,
4446                                   u64 bytes, int force)
4447 {
4448         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4449         struct btrfs_trans_handle *trans;
4450
4451         trans = (struct btrfs_trans_handle *)current->journal_info;
4452         if (trans)
4453                 return -EAGAIN;
4454
4455         if (force)
4456                 goto commit;
4457
4458         /* See if there is enough pinned space to make this reservation */
4459         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4460                                    bytes) >= 0)
4461                 goto commit;
4462
4463         /*
4464          * See if there is some space in the delayed insertion reservation for
4465          * this reservation.
4466          */
4467         if (space_info != delayed_rsv->space_info)
4468                 return -ENOSPC;
4469
4470         spin_lock(&delayed_rsv->lock);
4471         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4472                                    bytes - delayed_rsv->size) >= 0) {
4473                 spin_unlock(&delayed_rsv->lock);
4474                 return -ENOSPC;
4475         }
4476         spin_unlock(&delayed_rsv->lock);
4477
4478 commit:
4479         trans = btrfs_join_transaction(root);
4480         if (IS_ERR(trans))
4481                 return -ENOSPC;
4482
4483         return btrfs_commit_transaction(trans, root);
4484 }
4485
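/*
 * Flushing states, tried in increasing order of cost by flush_space():
 * run a bounded (then an unbounded) number of delayed items, flush delalloc
 * (then flush and wait for it), allocate a new metadata chunk, and finally
 * commit the transaction.
 */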
4486 enum flush_state {
4487         FLUSH_DELAYED_ITEMS_NR  =       1,
4488         FLUSH_DELAYED_ITEMS     =       2,
4489         FLUSH_DELALLOC          =       3,
4490         FLUSH_DELALLOC_WAIT     =       4,
4491         ALLOC_CHUNK             =       5,
4492         COMMIT_TRANS            =       6,
4493 };
4494
4495 static int flush_space(struct btrfs_root *root,
4496                        struct btrfs_space_info *space_info, u64 num_bytes,
4497                        u64 orig_bytes, int state)
4498 {
4499         struct btrfs_trans_handle *trans;
4500         int nr;
4501         int ret = 0;
4502
4503         switch (state) {
4504         case FLUSH_DELAYED_ITEMS_NR:
4505         case FLUSH_DELAYED_ITEMS:
4506                 if (state == FLUSH_DELAYED_ITEMS_NR)
4507                         nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4508                 else
4509                         nr = -1;
4510
4511                 trans = btrfs_join_transaction(root);
4512                 if (IS_ERR(trans)) {
4513                         ret = PTR_ERR(trans);
4514                         break;
4515                 }
4516                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4517                 btrfs_end_transaction(trans, root);
4518                 break;
4519         case FLUSH_DELALLOC:
4520         case FLUSH_DELALLOC_WAIT:
4521                 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4522                                 state == FLUSH_DELALLOC_WAIT);
4523                 break;
4524         case ALLOC_CHUNK:
4525                 trans = btrfs_join_transaction(root);
4526                 if (IS_ERR(trans)) {
4527                         ret = PTR_ERR(trans);
4528                         break;
4529                 }
4530                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4531                                      btrfs_get_alloc_profile(root, 0),
4532                                      CHUNK_ALLOC_NO_FORCE);
4533                 btrfs_end_transaction(trans, root);
4534                 if (ret == -ENOSPC)
4535                         ret = 0;
4536                 break;
4537         case COMMIT_TRANS:
4538                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4539                 break;
4540         default:
4541                 ret = -ENOSPC;
4542                 break;
4543         }
4544
4545         return ret;
4546 }
4547
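/*
 * How much metadata the async reclaim worker should try to free: zero if we
 * could still overcommit by min(nr_cpus * 1MB, 16MB), otherwise the amount
 * used beyond ~90-95% of the space info's total, limited to what is actually
 * reclaimable (bytes_may_use + bytes_reserved).
 */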
4548 static inline u64
4549 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4550                                  struct btrfs_space_info *space_info)
4551 {
4552         u64 used;
4553         u64 expected;
4554         u64 to_reclaim;
4555
4556         to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4557                                 16 * 1024 * 1024);
4558         spin_lock(&space_info->lock);
4559         if (can_overcommit(root, space_info, to_reclaim,
4560                            BTRFS_RESERVE_FLUSH_ALL)) {
4561                 to_reclaim = 0;
4562                 goto out;
4563         }
4564
4565         used = space_info->bytes_used + space_info->bytes_reserved +
4566                space_info->bytes_pinned + space_info->bytes_readonly +
4567                space_info->bytes_may_use;
4568         if (can_overcommit(root, space_info, 1024 * 1024,
4569                            BTRFS_RESERVE_FLUSH_ALL))
4570                 expected = div_factor_fine(space_info->total_bytes, 95);
4571         else
4572                 expected = div_factor_fine(space_info->total_bytes, 90);
4573
4574         if (used > expected)
4575                 to_reclaim = used - expected;
4576         else
4577                 to_reclaim = 0;
4578         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4579                                      space_info->bytes_reserved);
4580 out:
4581         spin_unlock(&space_info->lock);
4582
4583         return to_reclaim;
4584 }
4585
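/*
 * Background reclaim is worth kicking off once ~98% of the space info is
 * committed (used + reserved + pinned + readonly + may_use), but not when
 * bytes_used alone already exceeds that threshold (then we are simply full
 * and reclaim would only slow us down), nor while the filesystem is closing
 * or being remounted.
 */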
4586 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4587                                         struct btrfs_fs_info *fs_info, u64 used)
4588 {
4589         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4590
4591         /* If we're just plain full then async reclaim just slows us down. */
4592         if (space_info->bytes_used >= thresh)
4593                 return 0;
4594
4595         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4596                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4597 }
4598
4599 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4600                                        struct btrfs_fs_info *fs_info,
4601                                        int flush_state)
4602 {
4603         u64 used;
4604
4605         spin_lock(&space_info->lock);
4606         /*
4607          * We ran out of space and have not been able to get any free space via
4608          * flush_space, so don't bother doing async reclaim.
4609          */
4610         if (flush_state > COMMIT_TRANS && space_info->full) {
4611                 spin_unlock(&space_info->lock);
4612                 return 0;
4613         }
4614
4615         used = space_info->bytes_used + space_info->bytes_reserved +
4616                space_info->bytes_pinned + space_info->bytes_readonly +
4617                space_info->bytes_may_use;
4618         if (need_do_async_reclaim(space_info, fs_info, used)) {
4619                 spin_unlock(&space_info->lock);
4620                 return 1;
4621         }
4622         spin_unlock(&space_info->lock);
4623
4624         return 0;
4625 }
4626
4627 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4628 {
4629         struct btrfs_fs_info *fs_info;
4630         struct btrfs_space_info *space_info;
4631         u64 to_reclaim;
4632         int flush_state;
4633
4634         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4635         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4636
4637         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4638                                                       space_info);
4639         if (!to_reclaim)
4640                 return;
4641
4642         flush_state = FLUSH_DELAYED_ITEMS_NR;
4643         do {
4644                 flush_space(fs_info->fs_root, space_info, to_reclaim,
4645                             to_reclaim, flush_state);
4646                 flush_state++;
4647                 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4648                                                  flush_state))
4649                         return;
4650         } while (flush_state < COMMIT_TRANS);
4651 }
4652
4653 void btrfs_init_async_reclaim_work(struct work_struct *work)
4654 {
4655         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4656 }
4657
4658 /**
4659  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4660  * @root - the root we're allocating for
4661  * @block_rsv - the block_rsv we're allocating for
4662  * @orig_bytes - the number of bytes we want
4663  * @flush - whether or not we can flush to make our reservation
4664  *
4665  * This will reserve orig_bytes number of bytes from the space info associated
4666  * with the block_rsv.  If there is not enough space it will make an attempt to
4667  * flush out space to make room.  It will do this by flushing delalloc if
4668  * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
4669  * then no attempts to regain reservations will be made and this will fail if
4670  * there is not enough space already.
4671  */
4672 static int reserve_metadata_bytes(struct btrfs_root *root,
4673                                   struct btrfs_block_rsv *block_rsv,
4674                                   u64 orig_bytes,
4675                                   enum btrfs_reserve_flush_enum flush)
4676 {
4677         struct btrfs_space_info *space_info = block_rsv->space_info;
4678         u64 used;
4679         u64 num_bytes = orig_bytes;
4680         int flush_state = FLUSH_DELAYED_ITEMS_NR;
4681         int ret = 0;
4682         bool flushing = false;
4683
4684 again:
4685         ret = 0;
4686         spin_lock(&space_info->lock);
4687         /*
4688          * We only want to wait if somebody other than us is flushing and we
4689          * are actually allowed to flush all things.
4690          */
4691         while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4692                space_info->flush) {
4693                 spin_unlock(&space_info->lock);
4694                 /*
4695                  * If we have a trans handle we can't wait because the flusher
4696                  * may have to commit the transaction, which would mean we would
4697                  * deadlock since we are waiting for the flusher to finish, but
4698                  * hold the current transaction open.
4699                  */
4700                 if (current->journal_info)
4701                         return -EAGAIN;
4702                 ret = wait_event_killable(space_info->wait, !space_info->flush);
4703                 /* Must have been killed, return */
4704                 if (ret)
4705                         return -EINTR;
4706
4707                 spin_lock(&space_info->lock);
4708         }
4709
4710         ret = -ENOSPC;
4711         used = space_info->bytes_used + space_info->bytes_reserved +
4712                 space_info->bytes_pinned + space_info->bytes_readonly +
4713                 space_info->bytes_may_use;
4714
4715         /*
4716          * The idea here is that if we've not already over-reserved the space
4717          * info then we can go ahead and save our reservation first and then
4718          * start flushing if we need to.  Otherwise, if we've already
4719          * overcommitted, let's start flushing stuff first and then come back
4720          * and try to make our reservation.
4721          */
4722         if (used <= space_info->total_bytes) {
4723                 if (used + orig_bytes <= space_info->total_bytes) {
4724                         space_info->bytes_may_use += orig_bytes;
4725                         trace_btrfs_space_reservation(root->fs_info,
4726                                 "space_info", space_info->flags, orig_bytes, 1);
4727                         ret = 0;
4728                 } else {
4729                         /*
4730                          * Ok, set num_bytes to orig_bytes since we aren't
4731                          * overcommitted; this way we only try to reclaim what
4732                          * we need.
4733                          */
4734                         num_bytes = orig_bytes;
4735                 }
4736         } else {
4737                 /*
4738                  * Ok, we're overcommitted; set num_bytes to the overcommitted
4739                  * amount plus the number of bytes that we need for this
4740                  * reservation.
4741                  */
4742                 num_bytes = used - space_info->total_bytes +
4743                         (orig_bytes * 2);
4744         }
4745
4746         if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4747                 space_info->bytes_may_use += orig_bytes;
4748                 trace_btrfs_space_reservation(root->fs_info, "space_info",
4749                                               space_info->flags, orig_bytes,
4750                                               1);
4751                 ret = 0;
4752         }
4753
4754         /*
4755          * Couldn't make our reservation, save our place so while we're trying
4756          * to reclaim space we can actually use it instead of somebody else
4757          * stealing it from us.
4758          *
4759          * We make the other tasks wait for the flush only when we can flush
4760          * all things.
4761          */
4762         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4763                 flushing = true;
4764                 space_info->flush = 1;
4765         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4766                 used += orig_bytes;
4767                 /*
4768                  * We will do the space reservation dance during log replay,
4769                  * which means we won't have fs_info->fs_root set, so don't do
4770                  * the async reclaim as we will panic.
4771                  */
4772                 if (!root->fs_info->log_root_recovering &&
4773                     need_do_async_reclaim(space_info, root->fs_info, used) &&
4774                     !work_busy(&root->fs_info->async_reclaim_work))
4775                         queue_work(system_unbound_wq,
4776                                    &root->fs_info->async_reclaim_work);
4777         }
4778         spin_unlock(&space_info->lock);
4779
4780         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4781                 goto out;
4782
4783         ret = flush_space(root, space_info, num_bytes, orig_bytes,
4784                           flush_state);
4785         flush_state++;
4786
4787         /*
4788          * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
4789          * could happen. So skip the delalloc flush.
4790          */
4791         if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4792             (flush_state == FLUSH_DELALLOC ||
4793              flush_state == FLUSH_DELALLOC_WAIT))
4794                 flush_state = ALLOC_CHUNK;
4795
4796         if (!ret)
4797                 goto again;
4798         else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4799                  flush_state < COMMIT_TRANS)
4800                 goto again;
4801         else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4802                  flush_state <= COMMIT_TRANS)
4803                 goto again;
4804
4805 out:
4806         if (ret == -ENOSPC &&
4807             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4808                 struct btrfs_block_rsv *global_rsv =
4809                         &root->fs_info->global_block_rsv;
4810
4811                 if (block_rsv != global_rsv &&
4812                     !block_rsv_use_bytes(global_rsv, orig_bytes))
4813                         ret = 0;
4814         }
4815         if (ret == -ENOSPC)
4816                 trace_btrfs_space_reservation(root->fs_info,
4817                                               "space_info:enospc",
4818                                               space_info->flags, orig_bytes, 1);
4819         if (flushing) {
4820                 spin_lock(&space_info->lock);
4821                 space_info->flush = 0;
4822                 wake_up_all(&space_info->wait);
4823                 spin_unlock(&space_info->lock);
4824         }
4825         return ret;
4826 }
4827
4828 static struct btrfs_block_rsv *get_block_rsv(
4829                                         const struct btrfs_trans_handle *trans,
4830                                         const struct btrfs_root *root)
4831 {
4832         struct btrfs_block_rsv *block_rsv = NULL;
4833
4834         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4835                 block_rsv = trans->block_rsv;
4836
4837         if (root == root->fs_info->csum_root && trans->adding_csums)
4838                 block_rsv = trans->block_rsv;
4839
4840         if (root == root->fs_info->uuid_root)
4841                 block_rsv = trans->block_rsv;
4842
4843         if (!block_rsv)
4844                 block_rsv = root->block_rsv;
4845
4846         if (!block_rsv)
4847                 block_rsv = &root->fs_info->empty_block_rsv;
4848
4849         return block_rsv;
4850 }
4851
4852 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4853                                u64 num_bytes)
4854 {
4855         int ret = -ENOSPC;
4856         spin_lock(&block_rsv->lock);
4857         if (block_rsv->reserved >= num_bytes) {
4858                 block_rsv->reserved -= num_bytes;
4859                 if (block_rsv->reserved < block_rsv->size)
4860                         block_rsv->full = 0;
4861                 ret = 0;
4862         }
4863         spin_unlock(&block_rsv->lock);
4864         return ret;
4865 }
4866
4867 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4868                                 u64 num_bytes, int update_size)
4869 {
4870         spin_lock(&block_rsv->lock);
4871         block_rsv->reserved += num_bytes;
4872         if (update_size)
4873                 block_rsv->size += num_bytes;
4874         else if (block_rsv->reserved >= block_rsv->size)
4875                 block_rsv->full = 1;
4876         spin_unlock(&block_rsv->lock);
4877 }
4878
4879 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4880                              struct btrfs_block_rsv *dest, u64 num_bytes,
4881                              int min_factor)
4882 {
4883         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4884         u64 min_bytes;
4885
4886         if (global_rsv->space_info != dest->space_info)
4887                 return -ENOSPC;
4888
4889         spin_lock(&global_rsv->lock);
4890         min_bytes = div_factor(global_rsv->size, min_factor);
4891         if (global_rsv->reserved < min_bytes + num_bytes) {
4892                 spin_unlock(&global_rsv->lock);
4893                 return -ENOSPC;
4894         }
4895         global_rsv->reserved -= num_bytes;
4896         if (global_rsv->reserved < global_rsv->size)
4897                 global_rsv->full = 0;
4898         spin_unlock(&global_rsv->lock);
4899
4900         block_rsv_add_bytes(dest, num_bytes, 1);
4901         return 0;
4902 }
4903
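/*
 * Shrink @block_rsv->size by @num_bytes ((u64)-1 means release everything).
 * Any reserved bytes that now exceed the new size are first used to top up
 * @dest (if given and not already full); whatever is left is returned to the
 * space info by decrementing bytes_may_use.
 */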
4904 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4905                                     struct btrfs_block_rsv *block_rsv,
4906                                     struct btrfs_block_rsv *dest, u64 num_bytes)
4907 {
4908         struct btrfs_space_info *space_info = block_rsv->space_info;
4909
4910         spin_lock(&block_rsv->lock);
4911         if (num_bytes == (u64)-1)
4912                 num_bytes = block_rsv->size;
4913         block_rsv->size -= num_bytes;
4914         if (block_rsv->reserved >= block_rsv->size) {
4915                 num_bytes = block_rsv->reserved - block_rsv->size;
4916                 block_rsv->reserved = block_rsv->size;
4917                 block_rsv->full = 1;
4918         } else {
4919                 num_bytes = 0;
4920         }
4921         spin_unlock(&block_rsv->lock);
4922
4923         if (num_bytes > 0) {
4924                 if (dest) {
4925                         spin_lock(&dest->lock);
4926                         if (!dest->full) {
4927                                 u64 bytes_to_add;
4928
4929                                 bytes_to_add = dest->size - dest->reserved;
4930                                 bytes_to_add = min(num_bytes, bytes_to_add);
4931                                 dest->reserved += bytes_to_add;
4932                                 if (dest->reserved >= dest->size)
4933                                         dest->full = 1;
4934                                 num_bytes -= bytes_to_add;
4935                         }
4936                         spin_unlock(&dest->lock);
4937                 }
4938                 if (num_bytes) {
4939                         spin_lock(&space_info->lock);
4940                         space_info->bytes_may_use -= num_bytes;
4941                         trace_btrfs_space_reservation(fs_info, "space_info",
4942                                         space_info->flags, num_bytes, 0);
4943                         spin_unlock(&space_info->lock);
4944                 }
4945         }
4946 }
4947
4948 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4949                                    struct btrfs_block_rsv *dst, u64 num_bytes)
4950 {
4951         int ret;
4952
4953         ret = block_rsv_use_bytes(src, num_bytes);
4954         if (ret)
4955                 return ret;
4956
4957         block_rsv_add_bytes(dst, num_bytes, 1);
4958         return 0;
4959 }
4960
4961 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4962 {
4963         memset(rsv, 0, sizeof(*rsv));
4964         spin_lock_init(&rsv->lock);
4965         rsv->type = type;
4966 }
4967
4968 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4969                                               unsigned short type)
4970 {
4971         struct btrfs_block_rsv *block_rsv;
4972         struct btrfs_fs_info *fs_info = root->fs_info;
4973
4974         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4975         if (!block_rsv)
4976                 return NULL;
4977
4978         btrfs_init_block_rsv(block_rsv, type);
4979         block_rsv->space_info = __find_space_info(fs_info,
4980                                                   BTRFS_BLOCK_GROUP_METADATA);
4981         return block_rsv;
4982 }
4983
4984 void btrfs_free_block_rsv(struct btrfs_root *root,
4985                           struct btrfs_block_rsv *rsv)
4986 {
4987         if (!rsv)
4988                 return;
4989         btrfs_block_rsv_release(root, rsv, (u64)-1);
4990         kfree(rsv);
4991 }
4992
4993 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
4994 {
4995         kfree(rsv);
4996 }
4997
4998 int btrfs_block_rsv_add(struct btrfs_root *root,
4999                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5000                         enum btrfs_reserve_flush_enum flush)
5001 {
5002         int ret;
5003
5004         if (num_bytes == 0)
5005                 return 0;
5006
5007         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5008         if (!ret) {
5009                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5010                 return 0;
5011         }
5012
5013         return ret;
5014 }
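/*
 * A rough sketch of how callers elsewhere in btrfs typically combine these
 * block reserve helpers (the names are from this file, except
 * BTRFS_BLOCK_RSV_TEMP, the type usually given to short-lived reserves;
 * the exact flow varies per caller):
 *
 *	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 *				  BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret)
 *		... use the reserved space ...
 *	btrfs_free_block_rsv(root, rsv);	(releases any leftover bytes)
 */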
5015
5016 int btrfs_block_rsv_check(struct btrfs_root *root,
5017                           struct btrfs_block_rsv *block_rsv, int min_factor)
5018 {
5019         u64 num_bytes = 0;
5020         int ret = -ENOSPC;
5021
5022         if (!block_rsv)
5023                 return 0;
5024
5025         spin_lock(&block_rsv->lock);
5026         num_bytes = div_factor(block_rsv->size, min_factor);
5027         if (block_rsv->reserved >= num_bytes)
5028                 ret = 0;
5029         spin_unlock(&block_rsv->lock);
5030
5031         return ret;
5032 }
5033
5034 int btrfs_block_rsv_refill(struct btrfs_root *root,
5035                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5036                            enum btrfs_reserve_flush_enum flush)
5037 {
5038         u64 num_bytes = 0;
5039         int ret = -ENOSPC;
5040
5041         if (!block_rsv)
5042                 return 0;
5043
5044         spin_lock(&block_rsv->lock);
5045         num_bytes = min_reserved;
5046         if (block_rsv->reserved >= num_bytes)
5047                 ret = 0;
5048         else
5049                 num_bytes -= block_rsv->reserved;
5050         spin_unlock(&block_rsv->lock);
5051
5052         if (!ret)
5053                 return 0;
5054
5055         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5056         if (!ret) {
5057                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5058                 return 0;
5059         }
5060
5061         return ret;
5062 }
5063
5064 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
5065                             struct btrfs_block_rsv *dst_rsv,
5066                             u64 num_bytes)
5067 {
5068         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5069 }
5070
5071 void btrfs_block_rsv_release(struct btrfs_root *root,
5072                              struct btrfs_block_rsv *block_rsv,
5073                              u64 num_bytes)
5074 {
5075         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5076         if (global_rsv == block_rsv ||
5077             block_rsv->space_info != global_rsv->space_info)
5078                 global_rsv = NULL;
5079         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5080                                 num_bytes);
5081 }
5082
5083 /*
5084  * Helper to calculate the size of the global block reservation.
5085  * The desired value is the sum of the space used by the extent tree,
5086  * checksum tree and root tree.
5087  */
5088 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
5089 {
5090         struct btrfs_space_info *sinfo;
5091         u64 num_bytes;
5092         u64 meta_used;
5093         u64 data_used;
5094         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
5095
5096         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
5097         spin_lock(&sinfo->lock);
5098         data_used = sinfo->bytes_used;
5099         spin_unlock(&sinfo->lock);
5100
5101         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5102         spin_lock(&sinfo->lock);
5103         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
5104                 data_used = 0;
5105         meta_used = sinfo->bytes_used;
5106         spin_unlock(&sinfo->lock);
5107
5108         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
5109                     csum_size * 2;
5110         num_bytes += div_u64(data_used + meta_used, 50);
5111
5112         if (num_bytes * 3 > meta_used)
5113                 num_bytes = div_u64(meta_used, 3);
5114
5115         return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
5116 }
5117
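/*
 * Recompute the target size of the global block reserve (capped at 512MB)
 * and top up its reserved bytes from whatever part of the metadata space
 * info is currently unused, adjusting bytes_may_use accordingly.
 */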
5118 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5119 {
5120         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5121         struct btrfs_space_info *sinfo = block_rsv->space_info;
5122         u64 num_bytes;
5123
5124         num_bytes = calc_global_metadata_size(fs_info);
5125
5126         spin_lock(&sinfo->lock);
5127         spin_lock(&block_rsv->lock);
5128
5129         block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
5130
5131         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5132                     sinfo->bytes_reserved + sinfo->bytes_readonly +
5133                     sinfo->bytes_may_use;
5134
5135         if (sinfo->total_bytes > num_bytes) {
5136                 num_bytes = sinfo->total_bytes - num_bytes;
5137                 block_rsv->reserved += num_bytes;
5138                 sinfo->bytes_may_use += num_bytes;
5139                 trace_btrfs_space_reservation(fs_info, "space_info",
5140                                       sinfo->flags, num_bytes, 1);
5141         }
5142
5143         if (block_rsv->reserved >= block_rsv->size) {
5144                 num_bytes = block_rsv->reserved - block_rsv->size;
5145                 sinfo->bytes_may_use -= num_bytes;
5146                 trace_btrfs_space_reservation(fs_info, "space_info",
5147                                       sinfo->flags, num_bytes, 0);
5148                 block_rsv->reserved = block_rsv->size;
5149                 block_rsv->full = 1;
5150         }
5151
5152         spin_unlock(&block_rsv->lock);
5153         spin_unlock(&sinfo->lock);
5154 }
5155
5156 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5157 {
5158         struct btrfs_space_info *space_info;
5159
5160         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5161         fs_info->chunk_block_rsv.space_info = space_info;
5162
5163         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5164         fs_info->global_block_rsv.space_info = space_info;
5165         fs_info->delalloc_block_rsv.space_info = space_info;
5166         fs_info->trans_block_rsv.space_info = space_info;
5167         fs_info->empty_block_rsv.space_info = space_info;
5168         fs_info->delayed_block_rsv.space_info = space_info;
5169
5170         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5171         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5172         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5173         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5174         if (fs_info->quota_root)
5175                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5176         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5177
5178         update_global_block_rsv(fs_info);
5179 }
5180
5181 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5182 {
5183         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5184                                 (u64)-1);
5185         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5186         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5187         WARN_ON(fs_info->trans_block_rsv.size > 0);
5188         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5189         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5190         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5191         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5192         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5193 }
5194
5195 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5196                                   struct btrfs_root *root)
5197 {
5198         if (!trans->block_rsv)
5199                 return;
5200
5201         if (!trans->bytes_reserved)
5202                 return;
5203
5204         trace_btrfs_space_reservation(root->fs_info, "transaction",
5205                                       trans->transid, trans->bytes_reserved, 0);
5206         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5207         trans->bytes_reserved = 0;
5208 }
5209
5210 /*
5211  * To be called after all the new block groups attached to the transaction
5212  * handle have been created (btrfs_create_pending_block_groups()).
5213  */
5214 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5215 {
5216         struct btrfs_fs_info *fs_info = trans->root->fs_info;
5217
5218         if (!trans->chunk_bytes_reserved)
5219                 return;
5220
5221         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5222
5223         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5224                                 trans->chunk_bytes_reserved);
5225         trans->chunk_bytes_reserved = 0;
5226 }
5227
5228 /* Can only return 0 or -ENOSPC */
5229 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5230                                   struct inode *inode)
5231 {
5232         struct btrfs_root *root = BTRFS_I(inode)->root;
5233         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
5234         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5235
5236         /*
5237          * We need to hold space in order to delete our orphan item once we've
5238          * added it, so this takes the reservation now so that we can release
5239          * it later, when we are truly done with the orphan item.
5240          */
5241         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5242         trace_btrfs_space_reservation(root->fs_info, "orphan",
5243                                       btrfs_ino(inode), num_bytes, 1);
5244         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5245 }
5246
5247 void btrfs_orphan_release_metadata(struct inode *inode)
5248 {
5249         struct btrfs_root *root = BTRFS_I(inode)->root;
5250         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5251         trace_btrfs_space_reservation(root->fs_info, "orphan",
5252                                       btrfs_ino(inode), num_bytes, 0);
5253         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5254 }
5255
5256 /*
5257  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5258  * root: the root of the parent directory
5259  * rsv: block reservation
5260  * items: the number of items that we need to reserve
5261  * qgroup_reserved: used to return the reserved size in qgroup
5262  *
5263  * This function is used to reserve the space for snapshot/subvolume
5264  * creation and deletion. Those operations are different from the
5265  * common file/directory operations: they change two fs/file trees
5266  * and the root tree, and the number of items that the qgroup reserves
5267  * is different from the free space reservation. So we cannot use
5268  * the space reservation mechanism in start_transaction().
5269  */
5270 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5271                                      struct btrfs_block_rsv *rsv,
5272                                      int items,
5273                                      u64 *qgroup_reserved,
5274                                      bool use_global_rsv)
5275 {
5276         u64 num_bytes;
5277         int ret;
5278         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5279
5280         if (root->fs_info->quota_enabled) {
5281                 /* One for parent inode, two for dir entries */
5282                 num_bytes = 3 * root->nodesize;
5283                 ret = btrfs_qgroup_reserve(root, num_bytes);
5284                 if (ret)
5285                         return ret;
5286         } else {
5287                 num_bytes = 0;
5288         }
5289
5290         *qgroup_reserved = num_bytes;
5291
5292         num_bytes = btrfs_calc_trans_metadata_size(root, items);
5293         rsv->space_info = __find_space_info(root->fs_info,
5294                                             BTRFS_BLOCK_GROUP_METADATA);
5295         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5296                                   BTRFS_RESERVE_FLUSH_ALL);
5297
5298         if (ret == -ENOSPC && use_global_rsv)
5299                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
5300
5301         if (ret) {
5302                 if (*qgroup_reserved)
5303                         btrfs_qgroup_free(root, *qgroup_reserved);
5304         }
5305
5306         return ret;
5307 }
5308
5309 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5310                                       struct btrfs_block_rsv *rsv,
5311                                       u64 qgroup_reserved)
5312 {
5313         btrfs_block_rsv_release(root, rsv, (u64)-1);
5314 }
5315
5316 /**
5317  * drop_outstanding_extent - drop an outstanding extent
5318  * @inode: the inode we're dropping the extent for
5319  * @num_bytes: the number of bytes we're releasing.
5320  *
5321  * This is called when we are freeing up an outstanding extent, either called
5322  * after an error or after an extent is written.  This will return the number of
5323  * reserved extents that need to be freed.  This must be called with
5324  * BTRFS_I(inode)->lock held.
5325  */
5326 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5327 {
5328         unsigned drop_inode_space = 0;
5329         unsigned dropped_extents = 0;
5330         unsigned num_extents = 0;
5331
5332         num_extents = (unsigned)div64_u64(num_bytes +
5333                                           BTRFS_MAX_EXTENT_SIZE - 1,
5334                                           BTRFS_MAX_EXTENT_SIZE);
5335         ASSERT(num_extents);
5336         ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5337         BTRFS_I(inode)->outstanding_extents -= num_extents;
5338
5339         if (BTRFS_I(inode)->outstanding_extents == 0 &&
5340             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5341                                &BTRFS_I(inode)->runtime_flags))
5342                 drop_inode_space = 1;
5343
5344         /*
5345          * If we have more outstanding extents than we have reserved, or the
5346          * same amount, then we need to leave the reserved extents count alone.
5347          */
5348         if (BTRFS_I(inode)->outstanding_extents >=
5349             BTRFS_I(inode)->reserved_extents)
5350                 return drop_inode_space;
5351
5352         dropped_extents = BTRFS_I(inode)->reserved_extents -
5353                 BTRFS_I(inode)->outstanding_extents;
5354         BTRFS_I(inode)->reserved_extents -= dropped_extents;
5355         return dropped_extents + drop_inode_space;
5356 }
5357
5358 /**
5359  * calc_csum_metadata_size - return the amount of metadata space that must be
5360  *      reserved/free'd for the given bytes.
5361  * @inode: the inode we're manipulating
5362  * @num_bytes: the number of bytes in question
5363  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5364  *
5365  * This adjusts the number of csum_bytes in the inode and then returns the
5366  * correct amount of metadata that must either be reserved or freed.  We
5367  * calculate how many checksums we can fit into one leaf and then divide the
5368  * number of bytes that will need to be checksummed by this value to figure out
5369  * how many checksums will be required.  If we are adding bytes then the number
5370  * may go up and we will return the number of additional bytes that must be
5371  * reserved.  If it is going down we will return the number of bytes that must
5372  * be freed.
5373  *
5374  * This must be called with BTRFS_I(inode)->lock held.
5375  */
5376 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5377                                    int reserve)
5378 {
5379         struct btrfs_root *root = BTRFS_I(inode)->root;
5380         u64 old_csums, num_csums;
5381
5382         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5383             BTRFS_I(inode)->csum_bytes == 0)
5384                 return 0;
5385
5386         old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5387         if (reserve)
5388                 BTRFS_I(inode)->csum_bytes += num_bytes;
5389         else
5390                 BTRFS_I(inode)->csum_bytes -= num_bytes;
5391         num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5392
5393         /* No change, no need to reserve more */
5394         if (old_csums == num_csums)
5395                 return 0;
5396
5397         if (reserve)
5398                 return btrfs_calc_trans_metadata_size(root,
5399                                                       num_csums - old_csums);
5400
5401         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5402 }
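/*
 * Illustrative example of the csum math above.  Suppose, purely for the sake
 * of the example, that one leaf can hold checksums for 4MiB of data (the real
 * value depends on the nodesize and checksum size).  If ->csum_bytes is
 * currently 2MiB (1 leaf) and we reserve another 10MiB, the new total of
 * 12MiB needs 3 leaves, so we return the metadata size for 3 - 1 = 2 extra
 * leaves; freeing those 10MiB again later returns the same amount.
 */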
5403
5404 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5405 {
5406         struct btrfs_root *root = BTRFS_I(inode)->root;
5407         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5408         u64 to_reserve = 0;
5409         u64 csum_bytes;
5410         unsigned nr_extents = 0;
5411         int extra_reserve = 0;
5412         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5413         int ret = 0;
5414         bool delalloc_lock = true;
5415         u64 to_free = 0;
5416         unsigned dropped;
5417
5418         /* If we are a free space inode we must not flush, since we will be in
5419          * the middle of a transaction commit.  We also don't need the delalloc
5420          * mutex since we won't race with anybody.  We need this mostly to make
5421          * lockdep shut its filthy mouth.
5422          */
5423         if (btrfs_is_free_space_inode(inode)) {
5424                 flush = BTRFS_RESERVE_NO_FLUSH;
5425                 delalloc_lock = false;
5426         }
5427
5428         if (flush != BTRFS_RESERVE_NO_FLUSH &&
5429             btrfs_transaction_in_commit(root->fs_info))
5430                 schedule_timeout(1);
5431
5432         if (delalloc_lock)
5433                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5434
5435         num_bytes = ALIGN(num_bytes, root->sectorsize);
5436
5437         spin_lock(&BTRFS_I(inode)->lock);
5438         nr_extents = (unsigned)div64_u64(num_bytes +
5439                                          BTRFS_MAX_EXTENT_SIZE - 1,
5440                                          BTRFS_MAX_EXTENT_SIZE);
5441         BTRFS_I(inode)->outstanding_extents += nr_extents;
5442         nr_extents = 0;
5443
5444         if (BTRFS_I(inode)->outstanding_extents >
5445             BTRFS_I(inode)->reserved_extents)
5446                 nr_extents = BTRFS_I(inode)->outstanding_extents -
5447                         BTRFS_I(inode)->reserved_extents;
5448
5449         /*
5450          * Add an item to reserve for updating the inode when we complete the
5451          * delalloc io.
5452          */
5453         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5454                       &BTRFS_I(inode)->runtime_flags)) {
5455                 nr_extents++;
5456                 extra_reserve = 1;
5457         }
5458
5459         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
5460         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5461         csum_bytes = BTRFS_I(inode)->csum_bytes;
5462         spin_unlock(&BTRFS_I(inode)->lock);
5463
5464         if (root->fs_info->quota_enabled) {
5465                 ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
5466                 if (ret)
5467                         goto out_fail;
5468         }
5469
5470         ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5471         if (unlikely(ret)) {
5472                 if (root->fs_info->quota_enabled)
5473                         btrfs_qgroup_free(root, nr_extents * root->nodesize);
5474                 goto out_fail;
5475         }
5476
5477         spin_lock(&BTRFS_I(inode)->lock);
5478         if (extra_reserve) {
5479                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5480                         &BTRFS_I(inode)->runtime_flags);
5481                 nr_extents--;
5482         }
5483         BTRFS_I(inode)->reserved_extents += nr_extents;
5484         spin_unlock(&BTRFS_I(inode)->lock);
5485
5486         if (delalloc_lock)
5487                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5488
5489         if (to_reserve)
5490                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5491                                               btrfs_ino(inode), to_reserve, 1);
5492         block_rsv_add_bytes(block_rsv, to_reserve, 1);
5493
5494         return 0;
5495
5496 out_fail:
5497         spin_lock(&BTRFS_I(inode)->lock);
5498         dropped = drop_outstanding_extent(inode, num_bytes);
5499         /*
5500          * If the inode's csum_bytes is the same as the original
5501          * csum_bytes then we know we haven't raced with any free()ers
5502          * so we can just reduce our inode's csum_bytes and carry on.
5503          */
5504         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5505                 calc_csum_metadata_size(inode, num_bytes, 0);
5506         } else {
5507                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5508                 u64 bytes;
5509
5510                 /*
5511                  * This is tricky, but first we need to figure out how much was
5512                  * freed by any free-ers that occurred during this
5513                  * reservation, so we reset ->csum_bytes to the csum_bytes
5514                  * before we dropped our lock, and then call the free for the
5515                  * number of bytes that were freed while we were trying our
5516                  * reservation.
5517                  */
5518                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5519                 BTRFS_I(inode)->csum_bytes = csum_bytes;
5520                 to_free = calc_csum_metadata_size(inode, bytes, 0);
5521
5522
5523                 /*
5524                  * Now we need to see how much we would have freed had we not
5525                  * been making this reservation and our ->csum_bytes were not
5526                  * artificially inflated.
5527                  */
5528                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5529                 bytes = csum_bytes - orig_csum_bytes;
5530                 bytes = calc_csum_metadata_size(inode, bytes, 0);
5531
5532                 /*
5533                  * Now reset ->csum_bytes to what it should be.  If bytes is
5534                  * more than to_free then we would have freed more space had we
5535                  * not had an artificially high ->csum_bytes, so we need to free
5536                  * the remainder.  If bytes is the same or less then we don't
5537                  * need to do anything, the other free-ers did the correct
5538                  * thing.
5539                  */
5540                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5541                 if (bytes > to_free)
5542                         to_free = bytes - to_free;
5543                 else
5544                         to_free = 0;
5545         }
5546         spin_unlock(&BTRFS_I(inode)->lock);
5547         if (dropped)
5548                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5549
5550         if (to_free) {
5551                 btrfs_block_rsv_release(root, block_rsv, to_free);
5552                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5553                                               btrfs_ino(inode), to_free, 0);
5554         }
5555         if (delalloc_lock)
5556                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5557         return ret;
5558 }
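/*
 * Sketch of the out_fail csum accounting above, ignoring the final conversion
 * through btrfs_calc_trans_metadata_size() and writing leaves() for
 * btrfs_csum_bytes_to_leaves():
 *
 *	snapshot = csum_bytes taken under the lock (includes our num_bytes)
 *	F        = snapshot - ->csum_bytes      (freed by others meanwhile)
 *	to_free  = leaves(snapshot) - leaves(snapshot - F)
 *	would_be = leaves(snapshot - num_bytes) -
 *	           leaves(snapshot - num_bytes - F)
 *
 * If would_be > to_free, the concurrent free-ers released too little because
 * ->csum_bytes was inflated by our failed reservation, so we release the
 * difference here; otherwise there is nothing extra to free on the csum side.
 */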
5559
5560 /**
5561  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5562  * @inode: the inode to release the reservation for
5563  * @num_bytes: the number of bytes we're releasing
5564  *
5565  * This will release the metadata reservation for an inode.  This can be called
5566  * once we complete IO for a given set of bytes to release their metadata
5567  * reservations.
5568  */
5569 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5570 {
5571         struct btrfs_root *root = BTRFS_I(inode)->root;
5572         u64 to_free = 0;
5573         unsigned dropped;
5574
5575         num_bytes = ALIGN(num_bytes, root->sectorsize);
5576         spin_lock(&BTRFS_I(inode)->lock);
5577         dropped = drop_outstanding_extent(inode, num_bytes);
5578
5579         if (num_bytes)
5580                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5581         spin_unlock(&BTRFS_I(inode)->lock);
5582         if (dropped > 0)
5583                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5584
5585         if (btrfs_test_is_dummy_root(root))
5586                 return;
5587
5588         trace_btrfs_space_reservation(root->fs_info, "delalloc",
5589                                       btrfs_ino(inode), to_free, 0);
5590
5591         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5592                                 to_free);
5593 }
5594
5595 /**
5596  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5597  * @inode: inode we're writing to
5598  * @num_bytes: the number of bytes we want to allocate
5599  *
5600  * This will do the following things
5601  *
5602  * o reserve space in the data space info for num_bytes
5603  * o reserve space in the metadata space info based on the number of outstanding
5604  *   extents and how many csums will be needed
5605  * o add to the inode's ->delalloc_bytes
5606  * o add it to the fs_info's delalloc inodes list.
5607  *
5608  * This will return 0 for success and -ENOSPC if there is no space left.
5609  */
5610 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5611 {
5612         int ret;
5613
5614         ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
5615         if (ret)
5616                 return ret;
5617
5618         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5619         if (ret) {
5620                 btrfs_free_reserved_data_space(inode, num_bytes);
5621                 return ret;
5622         }
5623
5624         return 0;
5625 }
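/*
 * A minimal usage sketch for the reserve/release pair above; the
 * write_the_data() helper is hypothetical and only stands in for whatever
 * work a caller does between reserving and finishing (or failing) the
 * delalloc write.
 */
#if 0	/* example only, never compiled */
static int example_buffered_write(struct inode *inode, u64 num_bytes)
{
	int ret;

	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
	if (ret)
		return ret;	/* usually -ENOSPC, nothing was reserved */

	ret = write_the_data(inode, num_bytes);	/* hypothetical helper */
	if (ret)
		/* error before the IO completed: undo both reservations */
		btrfs_delalloc_release_space(inode, num_bytes);

	return ret;
}
#endif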
5626
5627 /**
5628  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5629  * @inode: inode we're releasing space for
5630  * @num_bytes: the number of bytes we want to free up
5631  *
5632  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5633  * called in the case that we don't need the metadata AND data reservations
5634  * anymore, for example if there is an error or we insert an inline extent.
5635  *
5636  * This function will release the metadata space that was not used and will
5637  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5638  * list if there are no delalloc bytes left.
5639  */
5640 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5641 {
5642         btrfs_delalloc_release_metadata(inode, num_bytes);
5643         btrfs_free_reserved_data_space(inode, num_bytes);
5644 }
5645
5646 static int update_block_group(struct btrfs_trans_handle *trans,
5647                               struct btrfs_root *root, u64 bytenr,
5648                               u64 num_bytes, int alloc)
5649 {
5650         struct btrfs_block_group_cache *cache = NULL;
5651         struct btrfs_fs_info *info = root->fs_info;
5652         u64 total = num_bytes;
5653         u64 old_val;
5654         u64 byte_in_group;
5655         int factor;
5656
5657         /* block accounting for super block */
5658         spin_lock(&info->delalloc_root_lock);
5659         old_val = btrfs_super_bytes_used(info->super_copy);
5660         if (alloc)
5661                 old_val += num_bytes;
5662         else
5663                 old_val -= num_bytes;
5664         btrfs_set_super_bytes_used(info->super_copy, old_val);
5665         spin_unlock(&info->delalloc_root_lock);
5666
5667         while (total) {
5668                 cache = btrfs_lookup_block_group(info, bytenr);
5669                 if (!cache)
5670                         return -ENOENT;
5671                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5672                                     BTRFS_BLOCK_GROUP_RAID1 |
5673                                     BTRFS_BLOCK_GROUP_RAID10))
5674                         factor = 2;
5675                 else
5676                         factor = 1;
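                /*
                 * Illustrative: with factor == 2 (DUP/RAID1/RAID10), every
                 * byte of logical allocation below bumps bytes_used by one
                 * byte but disk_used by two, since two copies hit the disk.
                 */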
5677                 /*
5678                  * If this block group has free space cache written out, we
5679                  * need to make sure to load it if we are removing space.  This
5680                  * is because we need the unpinning stage to actually add the
5681                  * space back to the block group, otherwise we will leak space.
5682                  */
5683                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5684                         cache_block_group(cache, 1);
5685
5686                 byte_in_group = bytenr - cache->key.objectid;
5687                 WARN_ON(byte_in_group > cache->key.offset);
5688
5689                 spin_lock(&cache->space_info->lock);
5690                 spin_lock(&cache->lock);
5691
5692                 if (btrfs_test_opt(root, SPACE_CACHE) &&
5693                     cache->disk_cache_state < BTRFS_DC_CLEAR)
5694                         cache->disk_cache_state = BTRFS_DC_CLEAR;
5695
5696                 old_val = btrfs_block_group_used(&cache->item);
5697                 num_bytes = min(total, cache->key.offset - byte_in_group);
5698                 if (alloc) {
5699                         old_val += num_bytes;
5700                         btrfs_set_block_group_used(&cache->item, old_val);
5701                         cache->reserved -= num_bytes;
5702                         cache->space_info->bytes_reserved -= num_bytes;
5703                         cache->space_info->bytes_used += num_bytes;
5704                         cache->space_info->disk_used += num_bytes * factor;
5705                         spin_unlock(&cache->lock);
5706                         spin_unlock(&cache->space_info->lock);
5707                 } else {
5708                         old_val -= num_bytes;
5709                         btrfs_set_block_group_used(&cache->item, old_val);
5710                         cache->pinned += num_bytes;
5711                         cache->space_info->bytes_pinned += num_bytes;
5712                         cache->space_info->bytes_used -= num_bytes;
5713                         cache->space_info->disk_used -= num_bytes * factor;
5714                         spin_unlock(&cache->lock);
5715                         spin_unlock(&cache->space_info->lock);
5716
5717                         set_extent_dirty(info->pinned_extents,
5718                                          bytenr, bytenr + num_bytes - 1,
5719                                          GFP_NOFS | __GFP_NOFAIL);
5720                         /*
5721                          * No longer have used bytes in this block group, queue
5722                          * it for deletion.
5723                          */
5724                         if (old_val == 0) {
5725                                 spin_lock(&info->unused_bgs_lock);
5726                                 if (list_empty(&cache->bg_list)) {
5727                                         btrfs_get_block_group(cache);
5728                                         list_add_tail(&cache->bg_list,
5729                                                       &info->unused_bgs);
5730                                 }
5731                                 spin_unlock(&info->unused_bgs_lock);
5732                         }
5733                 }
5734
5735                 spin_lock(&trans->transaction->dirty_bgs_lock);
5736                 if (list_empty(&cache->dirty_list)) {
5737                         list_add_tail(&cache->dirty_list,
5738                                       &trans->transaction->dirty_bgs);
5739                         trans->transaction->num_dirty_bgs++;
5740                         btrfs_get_block_group(cache);
5741                 }
5742                 spin_unlock(&trans->transaction->dirty_bgs_lock);
5743
5744                 btrfs_put_block_group(cache);
5745                 total -= num_bytes;
5746                 bytenr += num_bytes;
5747         }
5748         return 0;
5749 }
5750
5751 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5752 {
5753         struct btrfs_block_group_cache *cache;
5754         u64 bytenr;
5755
5756         spin_lock(&root->fs_info->block_group_cache_lock);
5757         bytenr = root->fs_info->first_logical_byte;
5758         spin_unlock(&root->fs_info->block_group_cache_lock);
5759
5760         if (bytenr < (u64)-1)
5761                 return bytenr;
5762
5763         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5764         if (!cache)
5765                 return 0;
5766
5767         bytenr = cache->key.objectid;
5768         btrfs_put_block_group(cache);
5769
5770         return bytenr;
5771 }
5772
5773 static int pin_down_extent(struct btrfs_root *root,
5774                            struct btrfs_block_group_cache *cache,
5775                            u64 bytenr, u64 num_bytes, int reserved)
5776 {
5777         spin_lock(&cache->space_info->lock);
5778         spin_lock(&cache->lock);
5779         cache->pinned += num_bytes;
5780         cache->space_info->bytes_pinned += num_bytes;
5781         if (reserved) {
5782                 cache->reserved -= num_bytes;
5783                 cache->space_info->bytes_reserved -= num_bytes;
5784         }
5785         spin_unlock(&cache->lock);
5786         spin_unlock(&cache->space_info->lock);
5787
5788         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5789                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5790         if (reserved)
5791                 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
5792         return 0;
5793 }
5794
5795 /*
5796  * this function must be called within a transaction
5797  */
5798 int btrfs_pin_extent(struct btrfs_root *root,
5799                      u64 bytenr, u64 num_bytes, int reserved)
5800 {
5801         struct btrfs_block_group_cache *cache;
5802
5803         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5804         BUG_ON(!cache); /* Logic error */
5805
5806         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5807
5808         btrfs_put_block_group(cache);
5809         return 0;
5810 }
5811
5812 /*
5813  * this function must be called within a transaction
5814  */
5815 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5816                                     u64 bytenr, u64 num_bytes)
5817 {
5818         struct btrfs_block_group_cache *cache;
5819         int ret;
5820
5821         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5822         if (!cache)
5823                 return -EINVAL;
5824
5825         /*
5826          * pull in the free space cache (if any) so that our pin
5827          * removes the free space from the cache.  We have load_only set
5828          * to one because the slow code to read in the free extents does check
5829          * the pinned extents.
5830          */
5831         cache_block_group(cache, 1);
5832
5833         pin_down_extent(root, cache, bytenr, num_bytes, 0);
5834
5835         /* remove us from the free space cache (if we're there at all) */
5836         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5837         btrfs_put_block_group(cache);
5838         return ret;
5839 }
5840
5841 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5842 {
5843         int ret;
5844         struct btrfs_block_group_cache *block_group;
5845         struct btrfs_caching_control *caching_ctl;
5846
5847         block_group = btrfs_lookup_block_group(root->fs_info, start);
5848         if (!block_group)
5849                 return -EINVAL;
5850
5851         cache_block_group(block_group, 0);
5852         caching_ctl = get_caching_control(block_group);
5853
5854         if (!caching_ctl) {
5855                 /* Logic error */
5856                 BUG_ON(!block_group_cache_done(block_group));
5857                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5858         } else {
5859                 mutex_lock(&caching_ctl->mutex);
5860
5861                 if (start >= caching_ctl->progress) {
5862                         ret = add_excluded_extent(root, start, num_bytes);
5863                 } else if (start + num_bytes <= caching_ctl->progress) {
5864                         ret = btrfs_remove_free_space(block_group,
5865                                                       start, num_bytes);
5866                 } else {
5867                         num_bytes = caching_ctl->progress - start;
5868                         ret = btrfs_remove_free_space(block_group,
5869                                                       start, num_bytes);
5870                         if (ret)
5871                                 goto out_lock;
5872
5873                         num_bytes = (start + num_bytes) -
5874                                 caching_ctl->progress;
5875                         start = caching_ctl->progress;
5876                         ret = add_excluded_extent(root, start, num_bytes);
5877                 }
5878 out_lock:
5879                 mutex_unlock(&caching_ctl->mutex);
5880                 put_caching_control(caching_ctl);
5881         }
5882         btrfs_put_block_group(block_group);
5883         return ret;
5884 }
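/*
 * For example, with the caching scan's progress at 100MiB, a logged extent
 * at [120MiB, 124MiB) has not been scanned yet and is simply added to the
 * excluded extents, while one at [40MiB, 44MiB) is already in the free space
 * cache and is removed from it instead.
 */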
5885
5886 int btrfs_exclude_logged_extents(struct btrfs_root *log,
5887                                  struct extent_buffer *eb)
5888 {
5889         struct btrfs_file_extent_item *item;
5890         struct btrfs_key key;
5891         int found_type;
5892         int i;
5893
5894         if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5895                 return 0;
5896
5897         for (i = 0; i < btrfs_header_nritems(eb); i++) {
5898                 btrfs_item_key_to_cpu(eb, &key, i);
5899                 if (key.type != BTRFS_EXTENT_DATA_KEY)
5900                         continue;
5901                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5902                 found_type = btrfs_file_extent_type(eb, item);
5903                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5904                         continue;
5905                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5906                         continue;
5907                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5908                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5909                 __exclude_logged_extent(log, key.objectid, key.offset);
5910         }
5911
5912         return 0;
5913 }
5914
5915 /**
5916  * btrfs_update_reserved_bytes - update the block_group and space info counters
5917  * @cache:      The cache we are manipulating
5918  * @num_bytes:  The number of bytes in question
5919  * @reserve:    One of the reservation enums
5920  * @delalloc:   The blocks are allocated for the delalloc write
5921  *
5922  * This is called by the allocator when it reserves space, or by somebody who is
5923  * freeing space that was never actually used on disk.  For example if you
5924  * reserve some space for a new leaf in transaction A and before transaction A
5925  * commits you free that leaf, you call this with reserve set to 0 in order to
5926  * clear the reservation.
5927  *
5928  * Metadata reservations should be made with RESERVE_ALLOC so we do the proper
5929  * ENOSPC accounting.  For data we handle the reservation through clearing the
5930  * delalloc bits in the io_tree.  We have to do this since we could end up
5931  * allocating less disk space for the amount of data we have reserved in the
5932  * case of compression.
5933  *
5934  * If this is a reservation and the block group has become read only we cannot
5935  * make the reservation and return -EAGAIN, otherwise this function always
5936  * succeeds.
5937  */
5938 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5939                                        u64 num_bytes, int reserve, int delalloc)
5940 {
5941         struct btrfs_space_info *space_info = cache->space_info;
5942         int ret = 0;
5943
5944         spin_lock(&space_info->lock);
5945         spin_lock(&cache->lock);
5946         if (reserve != RESERVE_FREE) {
5947                 if (cache->ro) {
5948                         ret = -EAGAIN;
5949                 } else {
5950                         cache->reserved += num_bytes;
5951                         space_info->bytes_reserved += num_bytes;
5952                         if (reserve == RESERVE_ALLOC) {
5953                                 trace_btrfs_space_reservation(cache->fs_info,
5954                                                 "space_info", space_info->flags,
5955                                                 num_bytes, 0);
5956                                 space_info->bytes_may_use -= num_bytes;
5957                         }
5958
5959                         if (delalloc)
5960                                 cache->delalloc_bytes += num_bytes;
5961                 }
5962         } else {
5963                 if (cache->ro)
5964                         space_info->bytes_readonly += num_bytes;
5965                 cache->reserved -= num_bytes;
5966                 space_info->bytes_reserved -= num_bytes;
5967
5968                 if (delalloc)
5969                         cache->delalloc_bytes -= num_bytes;
5970         }
5971         spin_unlock(&cache->lock);
5972         spin_unlock(&space_info->lock);
5973         return ret;
5974 }
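/*
 * For example, find_free_extent() below picks RESERVE_ALLOC (or
 * RESERVE_ALLOC_NO_ACCOUNT for data allocations) as the alloc_type it passes
 * here once it has chosen an extent, while btrfs_free_tree_block() uses
 * RESERVE_FREE when it hands back a block that was allocated in the current
 * transaction but never written.
 */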
5975
5976 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5977                                 struct btrfs_root *root)
5978 {
5979         struct btrfs_fs_info *fs_info = root->fs_info;
5980         struct btrfs_caching_control *next;
5981         struct btrfs_caching_control *caching_ctl;
5982         struct btrfs_block_group_cache *cache;
5983
5984         down_write(&fs_info->commit_root_sem);
5985
5986         list_for_each_entry_safe(caching_ctl, next,
5987                                  &fs_info->caching_block_groups, list) {
5988                 cache = caching_ctl->block_group;
5989                 if (block_group_cache_done(cache)) {
5990                         cache->last_byte_to_unpin = (u64)-1;
5991                         list_del_init(&caching_ctl->list);
5992                         put_caching_control(caching_ctl);
5993                 } else {
5994                         cache->last_byte_to_unpin = caching_ctl->progress;
5995                 }
5996         }
5997
5998         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5999                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6000         else
6001                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6002
6003         up_write(&fs_info->commit_root_sem);
6004
6005         update_global_block_rsv(fs_info);
6006 }
6007
6008 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6009                               const bool return_free_space)
6010 {
6011         struct btrfs_fs_info *fs_info = root->fs_info;
6012         struct btrfs_block_group_cache *cache = NULL;
6013         struct btrfs_space_info *space_info;
6014         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6015         u64 len;
6016         bool readonly;
6017
6018         while (start <= end) {
6019                 readonly = false;
6020                 if (!cache ||
6021                     start >= cache->key.objectid + cache->key.offset) {
6022                         if (cache)
6023                                 btrfs_put_block_group(cache);
6024                         cache = btrfs_lookup_block_group(fs_info, start);
6025                         BUG_ON(!cache); /* Logic error */
6026                 }
6027
6028                 len = cache->key.objectid + cache->key.offset - start;
6029                 len = min(len, end + 1 - start);
6030
6031                 if (start < cache->last_byte_to_unpin) {
6032                         len = min(len, cache->last_byte_to_unpin - start);
6033                         if (return_free_space)
6034                                 btrfs_add_free_space(cache, start, len);
6035                 }
6036
6037                 start += len;
6038                 space_info = cache->space_info;
6039
6040                 spin_lock(&space_info->lock);
6041                 spin_lock(&cache->lock);
6042                 cache->pinned -= len;
6043                 space_info->bytes_pinned -= len;
6044                 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6045                 if (cache->ro) {
6046                         space_info->bytes_readonly += len;
6047                         readonly = true;
6048                 }
6049                 spin_unlock(&cache->lock);
6050                 if (!readonly && global_rsv->space_info == space_info) {
6051                         spin_lock(&global_rsv->lock);
6052                         if (!global_rsv->full) {
6053                                 len = min(len, global_rsv->size -
6054                                           global_rsv->reserved);
6055                                 global_rsv->reserved += len;
6056                                 space_info->bytes_may_use += len;
6057                                 if (global_rsv->reserved >= global_rsv->size)
6058                                         global_rsv->full = 1;
6059                         }
6060                         spin_unlock(&global_rsv->lock);
6061                 }
6062                 spin_unlock(&space_info->lock);
6063         }
6064
6065         if (cache)
6066                 btrfs_put_block_group(cache);
6067         return 0;
6068 }
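/*
 * Example of the global reserve refill above: if the global block reserve is
 * 2MiB short of its target size and we unpin 5MiB of metadata in the same
 * space_info, 2MiB of it is steered back into the global reserve (and shows
 * up as bytes_may_use), while the remaining 3MiB simply becomes free space.
 */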
6069
6070 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6071                                struct btrfs_root *root)
6072 {
6073         struct btrfs_fs_info *fs_info = root->fs_info;
6074         struct extent_io_tree *unpin;
6075         u64 start;
6076         u64 end;
6077         int ret;
6078
6079         if (trans->aborted)
6080                 return 0;
6081
6082         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6083                 unpin = &fs_info->freed_extents[1];
6084         else
6085                 unpin = &fs_info->freed_extents[0];
6086
6087         while (1) {
6088                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6089                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6090                                             EXTENT_DIRTY, NULL);
6091                 if (ret) {
6092                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6093                         break;
6094                 }
6095
6096                 if (btrfs_test_opt(root, DISCARD))
6097                         ret = btrfs_discard_extent(root, start,
6098                                                    end + 1 - start, NULL);
6099
6100                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
6101                 unpin_extent_range(root, start, end, true);
6102                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6103                 cond_resched();
6104         }
6105
6106         return 0;
6107 }
6108
6109 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6110                              u64 owner, u64 root_objectid)
6111 {
6112         struct btrfs_space_info *space_info;
6113         u64 flags;
6114
6115         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6116                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6117                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
6118                 else
6119                         flags = BTRFS_BLOCK_GROUP_METADATA;
6120         } else {
6121                 flags = BTRFS_BLOCK_GROUP_DATA;
6122         }
6123
6124         space_info = __find_space_info(fs_info, flags);
6125         BUG_ON(!space_info); /* Logic bug */
6126         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6127 }
6128
6129
6130 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6131                                 struct btrfs_root *root,
6132                                 struct btrfs_delayed_ref_node *node, u64 parent,
6133                                 u64 root_objectid, u64 owner_objectid,
6134                                 u64 owner_offset, int refs_to_drop,
6135                                 struct btrfs_delayed_extent_op *extent_op)
6136 {
6137         struct btrfs_key key;
6138         struct btrfs_path *path;
6139         struct btrfs_fs_info *info = root->fs_info;
6140         struct btrfs_root *extent_root = info->extent_root;
6141         struct extent_buffer *leaf;
6142         struct btrfs_extent_item *ei;
6143         struct btrfs_extent_inline_ref *iref;
6144         int ret;
6145         int is_data;
6146         int extent_slot = 0;
6147         int found_extent = 0;
6148         int num_to_del = 1;
6149         int no_quota = node->no_quota;
6150         u32 item_size;
6151         u64 refs;
6152         u64 bytenr = node->bytenr;
6153         u64 num_bytes = node->num_bytes;
6154         int last_ref = 0;
6155         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6156                                                  SKINNY_METADATA);
6157
6158         if (!info->quota_enabled || !is_fstree(root_objectid))
6159                 no_quota = 1;
6160
6161         path = btrfs_alloc_path();
6162         if (!path)
6163                 return -ENOMEM;
6164
6165         path->reada = 1;
6166         path->leave_spinning = 1;
6167
6168         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6169         BUG_ON(!is_data && refs_to_drop != 1);
6170
6171         if (is_data)
6172                 skinny_metadata = 0;
6173
6174         ret = lookup_extent_backref(trans, extent_root, path, &iref,
6175                                     bytenr, num_bytes, parent,
6176                                     root_objectid, owner_objectid,
6177                                     owner_offset);
6178         if (ret == 0) {
6179                 extent_slot = path->slots[0];
6180                 while (extent_slot >= 0) {
6181                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6182                                               extent_slot);
6183                         if (key.objectid != bytenr)
6184                                 break;
6185                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6186                             key.offset == num_bytes) {
6187                                 found_extent = 1;
6188                                 break;
6189                         }
6190                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6191                             key.offset == owner_objectid) {
6192                                 found_extent = 1;
6193                                 break;
6194                         }
6195                         if (path->slots[0] - extent_slot > 5)
6196                                 break;
6197                         extent_slot--;
6198                 }
6199 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6200                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6201                 if (found_extent && item_size < sizeof(*ei))
6202                         found_extent = 0;
6203 #endif
6204                 if (!found_extent) {
6205                         BUG_ON(iref);
6206                         ret = remove_extent_backref(trans, extent_root, path,
6207                                                     NULL, refs_to_drop,
6208                                                     is_data, &last_ref);
6209                         if (ret) {
6210                                 btrfs_abort_transaction(trans, extent_root, ret);
6211                                 goto out;
6212                         }
6213                         btrfs_release_path(path);
6214                         path->leave_spinning = 1;
6215
6216                         key.objectid = bytenr;
6217                         key.type = BTRFS_EXTENT_ITEM_KEY;
6218                         key.offset = num_bytes;
6219
6220                         if (!is_data && skinny_metadata) {
6221                                 key.type = BTRFS_METADATA_ITEM_KEY;
6222                                 key.offset = owner_objectid;
6223                         }
6224
6225                         ret = btrfs_search_slot(trans, extent_root,
6226                                                 &key, path, -1, 1);
6227                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6228                                 /*
6229                                  * Couldn't find our skinny metadata item,
6230                                  * see if we have ye olde extent item.
6231                                  */
6232                                 path->slots[0]--;
6233                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6234                                                       path->slots[0]);
6235                                 if (key.objectid == bytenr &&
6236                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6237                                     key.offset == num_bytes)
6238                                         ret = 0;
6239                         }
6240
6241                         if (ret > 0 && skinny_metadata) {
6242                                 skinny_metadata = false;
6243                                 key.objectid = bytenr;
6244                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6245                                 key.offset = num_bytes;
6246                                 btrfs_release_path(path);
6247                                 ret = btrfs_search_slot(trans, extent_root,
6248                                                         &key, path, -1, 1);
6249                         }
6250
6251                         if (ret) {
6252                                 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6253                                         ret, bytenr);
6254                                 if (ret > 0)
6255                                         btrfs_print_leaf(extent_root,
6256                                                          path->nodes[0]);
6257                         }
6258                         if (ret < 0) {
6259                                 btrfs_abort_transaction(trans, extent_root, ret);
6260                                 goto out;
6261                         }
6262                         extent_slot = path->slots[0];
6263                 }
6264         } else if (WARN_ON(ret == -ENOENT)) {
6265                 btrfs_print_leaf(extent_root, path->nodes[0]);
6266                 btrfs_err(info,
6267                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6268                         bytenr, parent, root_objectid, owner_objectid,
6269                         owner_offset);
6270                 btrfs_abort_transaction(trans, extent_root, ret);
6271                 goto out;
6272         } else {
6273                 btrfs_abort_transaction(trans, extent_root, ret);
6274                 goto out;
6275         }
6276
6277         leaf = path->nodes[0];
6278         item_size = btrfs_item_size_nr(leaf, extent_slot);
6279 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6280         if (item_size < sizeof(*ei)) {
6281                 BUG_ON(found_extent || extent_slot != path->slots[0]);
6282                 ret = convert_extent_item_v0(trans, extent_root, path,
6283                                              owner_objectid, 0);
6284                 if (ret < 0) {
6285                         btrfs_abort_transaction(trans, extent_root, ret);
6286                         goto out;
6287                 }
6288
6289                 btrfs_release_path(path);
6290                 path->leave_spinning = 1;
6291
6292                 key.objectid = bytenr;
6293                 key.type = BTRFS_EXTENT_ITEM_KEY;
6294                 key.offset = num_bytes;
6295
6296                 ret = btrfs_search_slot(trans, extent_root, &key, path,
6297                                         -1, 1);
6298                 if (ret) {
6299                         btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6300                                 ret, bytenr);
6301                         btrfs_print_leaf(extent_root, path->nodes[0]);
6302                 }
6303                 if (ret < 0) {
6304                         btrfs_abort_transaction(trans, extent_root, ret);
6305                         goto out;
6306                 }
6307
6308                 extent_slot = path->slots[0];
6309                 leaf = path->nodes[0];
6310                 item_size = btrfs_item_size_nr(leaf, extent_slot);
6311         }
6312 #endif
6313         BUG_ON(item_size < sizeof(*ei));
6314         ei = btrfs_item_ptr(leaf, extent_slot,
6315                             struct btrfs_extent_item);
6316         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6317             key.type == BTRFS_EXTENT_ITEM_KEY) {
6318                 struct btrfs_tree_block_info *bi;
6319                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6320                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6321                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6322         }
6323
6324         refs = btrfs_extent_refs(leaf, ei);
6325         if (refs < refs_to_drop) {
6326                 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
6327                           "for bytenr %Lu", refs_to_drop, refs, bytenr);
6328                 ret = -EINVAL;
6329                 btrfs_abort_transaction(trans, extent_root, ret);
6330                 goto out;
6331         }
6332         refs -= refs_to_drop;
6333
6334         if (refs > 0) {
6335                 if (extent_op)
6336                         __run_delayed_extent_op(extent_op, leaf, ei);
6337                 /*
6338                  * In the case of inline back ref, reference count will
6339                  * be updated by remove_extent_backref
6340                  */
6341                 if (iref) {
6342                         BUG_ON(!found_extent);
6343                 } else {
6344                         btrfs_set_extent_refs(leaf, ei, refs);
6345                         btrfs_mark_buffer_dirty(leaf);
6346                 }
6347                 if (found_extent) {
6348                         ret = remove_extent_backref(trans, extent_root, path,
6349                                                     iref, refs_to_drop,
6350                                                     is_data, &last_ref);
6351                         if (ret) {
6352                                 btrfs_abort_transaction(trans, extent_root, ret);
6353                                 goto out;
6354                         }
6355                 }
6356                 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
6357                                  root_objectid);
6358         } else {
6359                 if (found_extent) {
6360                         BUG_ON(is_data && refs_to_drop !=
6361                                extent_data_ref_count(root, path, iref));
6362                         if (iref) {
6363                                 BUG_ON(path->slots[0] != extent_slot);
6364                         } else {
6365                                 BUG_ON(path->slots[0] != extent_slot + 1);
6366                                 path->slots[0] = extent_slot;
6367                                 num_to_del = 2;
6368                         }
6369                 }
6370
6371                 last_ref = 1;
6372                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6373                                       num_to_del);
6374                 if (ret) {
6375                         btrfs_abort_transaction(trans, extent_root, ret);
6376                         goto out;
6377                 }
6378                 btrfs_release_path(path);
6379
6380                 if (is_data) {
6381                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
6382                         if (ret) {
6383                                 btrfs_abort_transaction(trans, extent_root, ret);
6384                                 goto out;
6385                         }
6386                 }
6387
6388                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6389                 if (ret) {
6390                         btrfs_abort_transaction(trans, extent_root, ret);
6391                         goto out;
6392                 }
6393         }
6394         btrfs_release_path(path);
6395
6396 out:
6397         btrfs_free_path(path);
6398         return ret;
6399 }
6400
6401 /*
6402  * when we free a block, it is possible (and likely) that we free the last
6403  * delayed ref for that extent as well.  This searches the delayed ref tree for
6404  * a given extent, and if there are no other delayed refs to be processed, it
6405  * removes it from the tree.
6406  */
6407 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6408                                       struct btrfs_root *root, u64 bytenr)
6409 {
6410         struct btrfs_delayed_ref_head *head;
6411         struct btrfs_delayed_ref_root *delayed_refs;
6412         int ret = 0;
6413
6414         delayed_refs = &trans->transaction->delayed_refs;
6415         spin_lock(&delayed_refs->lock);
6416         head = btrfs_find_delayed_ref_head(trans, bytenr);
6417         if (!head)
6418                 goto out_delayed_unlock;
6419
6420         spin_lock(&head->lock);
6421         if (!list_empty(&head->ref_list))
6422                 goto out;
6423
6424         if (head->extent_op) {
6425                 if (!head->must_insert_reserved)
6426                         goto out;
6427                 btrfs_free_delayed_extent_op(head->extent_op);
6428                 head->extent_op = NULL;
6429         }
6430
6431         /*
6432          * waiting for the lock here would deadlock.  If someone else has it
6433          * locked, they are already in the process of dropping it anyway.
6434          */
6435         if (!mutex_trylock(&head->mutex))
6436                 goto out;
6437
6438         /*
6439          * at this point we have a head with no other entries.  Go
6440          * ahead and process it.
6441          */
6442         head->node.in_tree = 0;
6443         rb_erase(&head->href_node, &delayed_refs->href_root);
6444
6445         atomic_dec(&delayed_refs->num_entries);
6446
6447         /*
6448          * we don't take a ref on the node because we're removing it from the
6449          * tree, so we just steal the ref the tree was holding.
6450          */
6451         delayed_refs->num_heads--;
6452         if (head->processing == 0)
6453                 delayed_refs->num_heads_ready--;
6454         head->processing = 0;
6455         spin_unlock(&head->lock);
6456         spin_unlock(&delayed_refs->lock);
6457
6458         BUG_ON(head->extent_op);
6459         if (head->must_insert_reserved)
6460                 ret = 1;
6461
6462         mutex_unlock(&head->mutex);
6463         btrfs_put_delayed_ref(&head->node);
6464         return ret;
6465 out:
6466         spin_unlock(&head->lock);
6467
6468 out_delayed_unlock:
6469         spin_unlock(&delayed_refs->lock);
6470         return 0;
6471 }
6472
6473 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6474                            struct btrfs_root *root,
6475                            struct extent_buffer *buf,
6476                            u64 parent, int last_ref)
6477 {
6478         int pin = 1;
6479         int ret;
6480
6481         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6482                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6483                                         buf->start, buf->len,
6484                                         parent, root->root_key.objectid,
6485                                         btrfs_header_level(buf),
6486                                         BTRFS_DROP_DELAYED_REF, NULL, 0);
6487                 BUG_ON(ret); /* -ENOMEM */
6488         }
6489
6490         if (!last_ref)
6491                 return;
6492
6493         if (btrfs_header_generation(buf) == trans->transid) {
6494                 struct btrfs_block_group_cache *cache;
6495
6496                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6497                         ret = check_ref_cleanup(trans, root, buf->start);
6498                         if (!ret)
6499                                 goto out;
6500                 }
6501
6502                 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6503
6504                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6505                         pin_down_extent(root, cache, buf->start, buf->len, 1);
6506                         btrfs_put_block_group(cache);
6507                         goto out;
6508                 }
6509
6510                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6511
6512                 btrfs_add_free_space(cache, buf->start, buf->len);
6513                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6514                 btrfs_put_block_group(cache);
6515                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6516                 pin = 0;
6517         }
6518 out:
6519         if (pin)
6520                 add_pinned_bytes(root->fs_info, buf->len,
6521                                  btrfs_header_level(buf),
6522                                  root->root_key.objectid);
6523
6524         /*
6525          * We are deleting the buffer; clear the corrupt flag since it doesn't matter
6526          * anymore.
6527          */
6528         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6529 }
6530
6531 /* Can return -ENOMEM */
6532 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6533                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6534                       u64 owner, u64 offset, int no_quota)
6535 {
6536         int ret;
6537         struct btrfs_fs_info *fs_info = root->fs_info;
6538
6539         if (btrfs_test_is_dummy_root(root))
6540                 return 0;
6541
6542         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6543
6544         /*
6545          * tree log blocks never actually go into the extent allocation
6546          * tree, just update pinning info and exit early.
6547          */
6548         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6549                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6550                 /* unlocks the pinned mutex */
6551                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
6552                 ret = 0;
6553         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6554                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6555                                         num_bytes,
6556                                         parent, root_objectid, (int)owner,
6557                                         BTRFS_DROP_DELAYED_REF, NULL, no_quota);
6558         } else {
6559                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6560                                                 num_bytes,
6561                                                 parent, root_objectid, owner,
6562                                                 offset, BTRFS_DROP_DELAYED_REF,
6563                                                 NULL, no_quota);
6564         }
6565         return ret;
6566 }
6567
6568 /*
6569  * when we wait for progress in the block group caching, it's because
6570  * our allocation attempt failed at least once.  So, we must sleep
6571  * and let some progress happen before we try again.
6572  *
6573  * This function will sleep at least once waiting for new free space to
6574  * show up, and then it will check the block group free space numbers
6575  * for our min num_bytes.  Another option is to have it go ahead
6576  * and look in the rbtree for a free extent of a given size, but this
6577  * is a good start.
6578  *
6579  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6580  * any of the information in this block group.
6581  */
6582 static noinline void
6583 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6584                                 u64 num_bytes)
6585 {
6586         struct btrfs_caching_control *caching_ctl;
6587
6588         caching_ctl = get_caching_control(cache);
6589         if (!caching_ctl)
6590                 return;
6591
6592         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6593                    (cache->free_space_ctl->free_space >= num_bytes));
6594
6595         put_caching_control(caching_ctl);
6596 }
6597
6598 static noinline int
6599 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6600 {
6601         struct btrfs_caching_control *caching_ctl;
6602         int ret = 0;
6603
6604         caching_ctl = get_caching_control(cache);
6605         if (!caching_ctl)
6606                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6607
6608         wait_event(caching_ctl->wait, block_group_cache_done(cache));
6609         if (cache->cached == BTRFS_CACHE_ERROR)
6610                 ret = -EIO;
6611         put_caching_control(caching_ctl);
6612         return ret;
6613 }
6614
6615 int __get_raid_index(u64 flags)
6616 {
6617         if (flags & BTRFS_BLOCK_GROUP_RAID10)
6618                 return BTRFS_RAID_RAID10;
6619         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6620                 return BTRFS_RAID_RAID1;
6621         else if (flags & BTRFS_BLOCK_GROUP_DUP)
6622                 return BTRFS_RAID_DUP;
6623         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6624                 return BTRFS_RAID_RAID0;
6625         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6626                 return BTRFS_RAID_RAID5;
6627         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6628                 return BTRFS_RAID_RAID6;
6629
6630         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6631 }
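/*
 * For example, a block group with BTRFS_BLOCK_GROUP_DATA |
 * BTRFS_BLOCK_GROUP_RAID1 maps to BTRFS_RAID_RAID1, while a plain
 * single-copy group with no RAID bits set falls through to
 * BTRFS_RAID_SINGLE.
 */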
6632
6633 int get_block_group_index(struct btrfs_block_group_cache *cache)
6634 {
6635         return __get_raid_index(cache->flags);
6636 }
6637
6638 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
6639         [BTRFS_RAID_RAID10]     = "raid10",
6640         [BTRFS_RAID_RAID1]      = "raid1",
6641         [BTRFS_RAID_DUP]        = "dup",
6642         [BTRFS_RAID_RAID0]      = "raid0",
6643         [BTRFS_RAID_SINGLE]     = "single",
6644         [BTRFS_RAID_RAID5]      = "raid5",
6645         [BTRFS_RAID_RAID6]      = "raid6",
6646 };
6647
6648 static const char *get_raid_name(enum btrfs_raid_types type)
6649 {
6650         if (type >= BTRFS_NR_RAID_TYPES)
6651                 return NULL;
6652
6653         return btrfs_raid_type_names[type];
6654 }
6655
6656 enum btrfs_loop_type {
6657         LOOP_CACHING_NOWAIT = 0,
6658         LOOP_CACHING_WAIT = 1,
6659         LOOP_ALLOC_CHUNK = 2,
6660         LOOP_NO_EMPTY_SIZE = 3,
6661 };
6662
6663 static inline void
6664 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6665                        int delalloc)
6666 {
6667         if (delalloc)
6668                 down_read(&cache->data_rwsem);
6669 }
6670
6671 static inline void
6672 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6673                        int delalloc)
6674 {
6675         btrfs_get_block_group(cache);
6676         if (delalloc)
6677                 down_read(&cache->data_rwsem);
6678 }
6679
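/*
 * Grab and, for delalloc allocations, read-lock the block group that
 * currently backs @cluster.  cluster->block_group may change while we
 * sleep on data_rwsem, so the retry loop below retakes refill_lock and
 * re-checks the pointer before returning it (descriptive summary only).
 */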
6680 static struct btrfs_block_group_cache *
6681 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6682                    struct btrfs_free_cluster *cluster,
6683                    int delalloc)
6684 {
6685         struct btrfs_block_group_cache *used_bg;
6686         bool locked = false;
6687 again:
6688         spin_lock(&cluster->refill_lock);
6689         if (locked) {
6690                 if (used_bg == cluster->block_group)
6691                         return used_bg;
6692
6693                 up_read(&used_bg->data_rwsem);
6694                 btrfs_put_block_group(used_bg);
6695         }
6696
6697         used_bg = cluster->block_group;
6698         if (!used_bg)
6699                 return NULL;
6700
6701         if (used_bg == block_group)
6702                 return used_bg;
6703
6704         btrfs_get_block_group(used_bg);
6705
6706         if (!delalloc)
6707                 return used_bg;
6708
6709         if (down_read_trylock(&used_bg->data_rwsem))
6710                 return used_bg;
6711
6712         spin_unlock(&cluster->refill_lock);
6713         down_read(&used_bg->data_rwsem);
6714         locked = true;
6715         goto again;
6716 }
6717
6718 static inline void
6719 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6720                          int delalloc)
6721 {
6722         if (delalloc)
6723                 up_read(&cache->data_rwsem);
6724         btrfs_put_block_group(cache);
6725 }
6726
6727 /*
6728  * walks the btree of allocated extents and finds a hole of a given size.
6729  * The key ins is changed to record the hole:
6730  * ins->objectid == start position
6731  * ins->type == BTRFS_EXTENT_ITEM_KEY
6732  * ins->offset == the size of the hole.
6733  * Any available blocks before search_start are skipped.
6734  *
6735  * If there is no suitable free space, we will record the max size of
6736  * the free space extent currently available.
6737  */
6738 static noinline int find_free_extent(struct btrfs_root *orig_root,
6739                                      u64 num_bytes, u64 empty_size,
6740                                      u64 hint_byte, struct btrfs_key *ins,
6741                                      u64 flags, int delalloc)
6742 {
6743         int ret = 0;
6744         struct btrfs_root *root = orig_root->fs_info->extent_root;
6745         struct btrfs_free_cluster *last_ptr = NULL;
6746         struct btrfs_block_group_cache *block_group = NULL;
6747         u64 search_start = 0;
6748         u64 max_extent_size = 0;
6749         int empty_cluster = 2 * 1024 * 1024;
6750         struct btrfs_space_info *space_info;
6751         int loop = 0;
6752         int index = __get_raid_index(flags);
6753         int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
6754                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
6755         bool failed_cluster_refill = false;
6756         bool failed_alloc = false;
6757         bool use_cluster = true;
6758         bool have_caching_bg = false;
6759
6760         WARN_ON(num_bytes < root->sectorsize);
6761         ins->type = BTRFS_EXTENT_ITEM_KEY;
6762         ins->objectid = 0;
6763         ins->offset = 0;
6764
6765         trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
6766
6767         space_info = __find_space_info(root->fs_info, flags);
6768         if (!space_info) {
6769                 btrfs_err(root->fs_info, "No space info for %llu", flags);
6770                 return -ENOSPC;
6771         }
6772
6773         /*
6774          * If the space info is for both data and metadata it means we have a
6775          * small filesystem and we can't use the clustering stuff.
6776          */
6777         if (btrfs_mixed_space_info(space_info))
6778                 use_cluster = false;
6779
6780         if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
6781                 last_ptr = &root->fs_info->meta_alloc_cluster;
6782                 if (!btrfs_test_opt(root, SSD))
6783                         empty_cluster = 64 * 1024;
6784         }
6785
6786         if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
6787             btrfs_test_opt(root, SSD)) {
6788                 last_ptr = &root->fs_info->data_alloc_cluster;
6789         }
6790
6791         if (last_ptr) {
6792                 spin_lock(&last_ptr->lock);
6793                 if (last_ptr->block_group)
6794                         hint_byte = last_ptr->window_start;
6795                 spin_unlock(&last_ptr->lock);
6796         }
6797
6798         search_start = max(search_start, first_logical_byte(root, 0));
6799         search_start = max(search_start, hint_byte);
6800
6801         if (!last_ptr)
6802                 empty_cluster = 0;
6803
6804         if (search_start == hint_byte) {
6805                 block_group = btrfs_lookup_block_group(root->fs_info,
6806                                                        search_start);
6807                 /*
6808                  * we don't want to use the block group if it doesn't match our
6809                  * allocation bits, or if it's not cached.
6810                  *
6811                  * However if we are re-searching with an ideal block group
6812                  * picked out then we don't care that the block group is cached.
6813                  */
6814                 if (block_group && block_group_bits(block_group, flags) &&
6815                     block_group->cached != BTRFS_CACHE_NO) {
6816                         down_read(&space_info->groups_sem);
6817                         if (list_empty(&block_group->list) ||
6818                             block_group->ro) {
6819                                 /*
6820                                  * someone is removing this block group,
6821                                  * we can't jump into the have_block_group
6822                                  * target because our list pointers are not
6823                                  * valid
6824                                  */
6825                                 btrfs_put_block_group(block_group);
6826                                 up_read(&space_info->groups_sem);
6827                         } else {
6828                                 index = get_block_group_index(block_group);
6829                                 btrfs_lock_block_group(block_group, delalloc);
6830                                 goto have_block_group;
6831                         }
6832                 } else if (block_group) {
6833                         btrfs_put_block_group(block_group);
6834                 }
6835         }
6836 search:
6837         have_caching_bg = false;
6838         down_read(&space_info->groups_sem);
6839         list_for_each_entry(block_group, &space_info->block_groups[index],
6840                             list) {
6841                 u64 offset;
6842                 int cached;
6843
6844                 btrfs_grab_block_group(block_group, delalloc);
6845                 search_start = block_group->key.objectid;
6846
6847                 /*
6848                  * this can happen if we end up cycling through all the
6849                  * raid types, but we want to make sure we only allocate
6850                  * for the proper type.
6851                  */
6852                 if (!block_group_bits(block_group, flags)) {
6853                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
6854                                     BTRFS_BLOCK_GROUP_RAID1 |
6855                                     BTRFS_BLOCK_GROUP_RAID5 |
6856                                     BTRFS_BLOCK_GROUP_RAID6 |
6857                                     BTRFS_BLOCK_GROUP_RAID10;
6858
6859                         /*
6860                          * if they asked for extra copies and this block group
6861                          * doesn't provide them, bail.  This does allow us to
6862                          * fill raid0 from raid1.
6863                          */
6864                         if ((flags & extra) && !(block_group->flags & extra))
6865                                 goto loop;
6866                 }
6867
6868 have_block_group:
6869                 cached = block_group_cache_done(block_group);
6870                 if (unlikely(!cached)) {
6871                         ret = cache_block_group(block_group, 0);
6872                         BUG_ON(ret < 0);
6873                         ret = 0;
6874                 }
6875
6876                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6877                         goto loop;
6878                 if (unlikely(block_group->ro))
6879                         goto loop;
6880
6881                 /*
6882                  * OK, we want to try to use the cluster allocator, so
6883                  * let's look there
6884                  */
6885                 if (last_ptr) {
6886                         struct btrfs_block_group_cache *used_block_group;
6887                         unsigned long aligned_cluster;
6888                         /*
6889                          * the refill lock keeps out other
6890                          * people trying to start a new cluster
6891                          */
6892                         used_block_group = btrfs_lock_cluster(block_group,
6893                                                               last_ptr,
6894                                                               delalloc);
6895                         if (!used_block_group)
6896                                 goto refill_cluster;
6897
6898                         if (used_block_group != block_group &&
6899                             (used_block_group->ro ||
6900                              !block_group_bits(used_block_group, flags)))
6901                                 goto release_cluster;
6902
6903                         offset = btrfs_alloc_from_cluster(used_block_group,
6904                                                 last_ptr,
6905                                                 num_bytes,
6906                                                 used_block_group->key.objectid,
6907                                                 &max_extent_size);
6908                         if (offset) {
6909                                 /* we have a block, we're done */
6910                                 spin_unlock(&last_ptr->refill_lock);
6911                                 trace_btrfs_reserve_extent_cluster(root,
6912                                                 used_block_group,
6913                                                 search_start, num_bytes);
6914                                 if (used_block_group != block_group) {
6915                                         btrfs_release_block_group(block_group,
6916                                                                   delalloc);
6917                                         block_group = used_block_group;
6918                                 }
6919                                 goto checks;
6920                         }
6921
6922                         WARN_ON(last_ptr->block_group != used_block_group);
6923 release_cluster:
6924                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6925                          * set up a new cluster, so let's just skip it
6926                          * and let the allocator find whatever block
6927                          * it can find.  If we reach this point, we
6928                          * will have tried the cluster allocator
6929                          * plenty of times and not have found
6930                          * anything, so we are likely way too
6931                          * fragmented for the clustering stuff to find
6932                          * anything.
6933                          *
6934                          * However, if the cluster is taken from the
6935                          * current block group, release the cluster
6936                          * first, so that we stand a better chance of
6937                          * succeeding in the unclustered
6938                          * allocation.  */
6939                         if (loop >= LOOP_NO_EMPTY_SIZE &&
6940                             used_block_group != block_group) {
6941                                 spin_unlock(&last_ptr->refill_lock);
6942                                 btrfs_release_block_group(used_block_group,
6943                                                           delalloc);
6944                                 goto unclustered_alloc;
6945                         }
6946
6947                         /*
6948                          * this cluster didn't work out, free it and
6949                          * start over
6950                          */
6951                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
6952
6953                         if (used_block_group != block_group)
6954                                 btrfs_release_block_group(used_block_group,
6955                                                           delalloc);
6956 refill_cluster:
6957                         if (loop >= LOOP_NO_EMPTY_SIZE) {
6958                                 spin_unlock(&last_ptr->refill_lock);
6959                                 goto unclustered_alloc;
6960                         }
6961
6962                         aligned_cluster = max_t(unsigned long,
6963                                                 empty_cluster + empty_size,
6964                                               block_group->full_stripe_len);
6965
6966                         /* allocate a cluster in this block group */
6967                         ret = btrfs_find_space_cluster(root, block_group,
6968                                                        last_ptr, search_start,
6969                                                        num_bytes,
6970                                                        aligned_cluster);
6971                         if (ret == 0) {
6972                                 /*
6973                                  * now pull our allocation out of this
6974                                  * cluster
6975                                  */
6976                                 offset = btrfs_alloc_from_cluster(block_group,
6977                                                         last_ptr,
6978                                                         num_bytes,
6979                                                         search_start,
6980                                                         &max_extent_size);
6981                                 if (offset) {
6982                                         /* we found one, proceed */
6983                                         spin_unlock(&last_ptr->refill_lock);
6984                                         trace_btrfs_reserve_extent_cluster(root,
6985                                                 block_group, search_start,
6986                                                 num_bytes);
6987                                         goto checks;
6988                                 }
6989                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
6990                                    && !failed_cluster_refill) {
6991                                 spin_unlock(&last_ptr->refill_lock);
6992
6993                                 failed_cluster_refill = true;
6994                                 wait_block_group_cache_progress(block_group,
6995                                        num_bytes + empty_cluster + empty_size);
6996                                 goto have_block_group;
6997                         }
6998
6999                         /*
7000                          * at this point we either didn't find a cluster
7001                          * or we weren't able to allocate a block from our
7002                          * cluster.  Free the cluster we've been trying
7003                          * to use, and go to the next block group
7004                          */
7005                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7006                         spin_unlock(&last_ptr->refill_lock);
7007                         goto loop;
7008                 }
7009
7010 unclustered_alloc:
7011                 spin_lock(&block_group->free_space_ctl->tree_lock);
7012                 if (cached &&
7013                     block_group->free_space_ctl->free_space <
7014                     num_bytes + empty_cluster + empty_size) {
7015                         if (block_group->free_space_ctl->free_space >
7016                             max_extent_size)
7017                                 max_extent_size =
7018                                         block_group->free_space_ctl->free_space;
7019                         spin_unlock(&block_group->free_space_ctl->tree_lock);
7020                         goto loop;
7021                 }
7022                 spin_unlock(&block_group->free_space_ctl->tree_lock);
7023
7024                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7025                                                     num_bytes, empty_size,
7026                                                     &max_extent_size);
7027                 /*
7028                  * If we didn't find a chunk, and we haven't failed on this
7029                  * block group before, and this block group is in the middle of
7030                  * caching and we are ok with waiting, then go ahead and wait
7031                  * for progress to be made, and set failed_alloc to true.
7032                  *
7033                  * If failed_alloc is true then we've already waited on this
7034                  * block group once and should move on to the next block group.
7035                  */
7036                 if (!offset && !failed_alloc && !cached &&
7037                     loop > LOOP_CACHING_NOWAIT) {
7038                         wait_block_group_cache_progress(block_group,
7039                                                 num_bytes + empty_size);
7040                         failed_alloc = true;
7041                         goto have_block_group;
7042                 } else if (!offset) {
7043                         if (!cached)
7044                                 have_caching_bg = true;
7045                         goto loop;
7046                 }
7047 checks:
7048                 search_start = ALIGN(offset, root->stripesize);
7049
7050                 /* move on to the next group */
7051                 if (search_start + num_bytes >
7052                     block_group->key.objectid + block_group->key.offset) {
7053                         btrfs_add_free_space(block_group, offset, num_bytes);
7054                         goto loop;
7055                 }
7056
7057                 if (offset < search_start)
7058                         btrfs_add_free_space(block_group, offset,
7059                                              search_start - offset);
7060                 BUG_ON(offset > search_start);
7061
7062                 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
7063                                                   alloc_type, delalloc);
7064                 if (ret == -EAGAIN) {
7065                         btrfs_add_free_space(block_group, offset, num_bytes);
7066                         goto loop;
7067                 }
7068
7069                 /* we are all good, let's return */
7070                 ins->objectid = search_start;
7071                 ins->offset = num_bytes;
7072
7073                 trace_btrfs_reserve_extent(orig_root, block_group,
7074                                            search_start, num_bytes);
7075                 btrfs_release_block_group(block_group, delalloc);
7076                 break;
7077 loop:
7078                 failed_cluster_refill = false;
7079                 failed_alloc = false;
7080                 BUG_ON(index != get_block_group_index(block_group));
7081                 btrfs_release_block_group(block_group, delalloc);
7082         }
7083         up_read(&space_info->groups_sem);
7084
7085         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7086                 goto search;
7087
7088         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7089                 goto search;
7090
7091         /*
7092          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7093          *                      caching kthreads as we move along
7094          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7095          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7096          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7097          *                      again
7098          */
7099         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7100                 index = 0;
7101                 loop++;
7102                 if (loop == LOOP_ALLOC_CHUNK) {
7103                         struct btrfs_trans_handle *trans;
7104                         int exist = 0;
7105
7106                         trans = current->journal_info;
7107                         if (trans)
7108                                 exist = 1;
7109                         else
7110                                 trans = btrfs_join_transaction(root);
7111
7112                         if (IS_ERR(trans)) {
7113                                 ret = PTR_ERR(trans);
7114                                 goto out;
7115                         }
7116
7117                         ret = do_chunk_alloc(trans, root, flags,
7118                                              CHUNK_ALLOC_FORCE);
7119                         /*
7120                          * Do not bail out on ENOSPC, since we may
7121                          * still find space in existing block groups.
7122                          */
7123                         if (ret < 0 && ret != -ENOSPC)
7124                                 btrfs_abort_transaction(trans,
7125                                                         root, ret);
7126                         else
7127                                 ret = 0;
7128                         if (!exist)
7129                                 btrfs_end_transaction(trans, root);
7130                         if (ret)
7131                                 goto out;
7132                 }
7133
7134                 if (loop == LOOP_NO_EMPTY_SIZE) {
7135                         empty_size = 0;
7136                         empty_cluster = 0;
7137                 }
7138
7139                 goto search;
7140         } else if (!ins->objectid) {
7141                 ret = -ENOSPC;
7142         } else if (ins->objectid) {
7143                 ret = 0;
7144         }
7145 out:
7146         if (ret == -ENOSPC)
7147                 ins->offset = max_extent_size;
7148         return ret;
7149 }
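
/*
 * Illustrative caller pattern (see btrfs_reserve_extent() below): when
 * -ENOSPC is returned, ins->offset carries max_extent_size and the
 * request can be shrunk toward min_alloc_size before retrying:
 *
 *         num_bytes = min(num_bytes >> 1, ins->offset);
 *         num_bytes = round_down(num_bytes, root->sectorsize);
 *         num_bytes = max(num_bytes, min_alloc_size);
 */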
7150
7151 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7152                             int dump_block_groups)
7153 {
7154         struct btrfs_block_group_cache *cache;
7155         int index = 0;
7156
7157         spin_lock(&info->lock);
7158         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7159                info->flags,
7160                info->total_bytes - info->bytes_used - info->bytes_pinned -
7161                info->bytes_reserved - info->bytes_readonly,
7162                (info->full) ? "" : "not ");
7163         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7164                "reserved=%llu, may_use=%llu, readonly=%llu\n",
7165                info->total_bytes, info->bytes_used, info->bytes_pinned,
7166                info->bytes_reserved, info->bytes_may_use,
7167                info->bytes_readonly);
7168         spin_unlock(&info->lock);
7169
7170         if (!dump_block_groups)
7171                 return;
7172
7173         down_read(&info->groups_sem);
7174 again:
7175         list_for_each_entry(cache, &info->block_groups[index], list) {
7176                 spin_lock(&cache->lock);
7177                 printk(KERN_INFO "BTRFS: "
7178                            "block group %llu has %llu bytes, "
7179                            "%llu used %llu pinned %llu reserved %s\n",
7180                        cache->key.objectid, cache->key.offset,
7181                        btrfs_block_group_used(&cache->item), cache->pinned,
7182                        cache->reserved, cache->ro ? "[readonly]" : "");
7183                 btrfs_dump_free_space(cache, bytes);
7184                 spin_unlock(&cache->lock);
7185         }
7186         if (++index < BTRFS_NR_RAID_TYPES)
7187                 goto again;
7188         up_read(&info->groups_sem);
7189 }
7190
7191 int btrfs_reserve_extent(struct btrfs_root *root,
7192                          u64 num_bytes, u64 min_alloc_size,
7193                          u64 empty_size, u64 hint_byte,
7194                          struct btrfs_key *ins, int is_data, int delalloc)
7195 {
7196         bool final_tried = false;
7197         u64 flags;
7198         int ret;
7199
7200         flags = btrfs_get_alloc_profile(root, is_data);
7201 again:
7202         WARN_ON(num_bytes < root->sectorsize);
7203         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7204                                flags, delalloc);
7205
7206         if (ret == -ENOSPC) {
7207                 if (!final_tried && ins->offset) {
7208                         num_bytes = min(num_bytes >> 1, ins->offset);
7209                         num_bytes = round_down(num_bytes, root->sectorsize);
7210                         num_bytes = max(num_bytes, min_alloc_size);
7211                         if (num_bytes == min_alloc_size)
7212                                 final_tried = true;
7213                         goto again;
7214                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7215                         struct btrfs_space_info *sinfo;
7216
7217                         sinfo = __find_space_info(root->fs_info, flags);
7218                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
7219                                 flags, num_bytes);
7220                         if (sinfo)
7221                                 dump_space_info(sinfo, num_bytes, 1);
7222                 }
7223         }
7224
7225         return ret;
7226 }
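
/*
 * Example in-tree usage (see btrfs_alloc_tree_block() below): metadata
 * allocations request exactly one node,
 *
 *         ret = btrfs_reserve_extent(root, blocksize, blocksize,
 *                                    empty_size, hint, &ins, 0, 0);
 *
 * so min_alloc_size == num_bytes and the ENOSPC retry above cannot
 * shrink the request below a single node.
 */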
7227
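/*
 * Return a reserved extent to the free space accounting.  With @pin set
 * the range is pinned via pin_down_extent() instead of being added back
 * to the free space cache; the unpinned path optionally discards the
 * range when the DISCARD mount option is enabled.
 */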
7228 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7229                                         u64 start, u64 len,
7230                                         int pin, int delalloc)
7231 {
7232         struct btrfs_block_group_cache *cache;
7233         int ret = 0;
7234
7235         cache = btrfs_lookup_block_group(root->fs_info, start);
7236         if (!cache) {
7237                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
7238                         start);
7239                 return -ENOSPC;
7240         }
7241
7242         if (pin)
7243                 pin_down_extent(root, cache, start, len, 1);
7244         else {
7245                 if (btrfs_test_opt(root, DISCARD))
7246                         ret = btrfs_discard_extent(root, start, len, NULL);
7247                 btrfs_add_free_space(cache, start, len);
7248                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
7249         }
7250
7251         btrfs_put_block_group(cache);
7252
7253         trace_btrfs_reserved_extent_free(root, start, len);
7254
7255         return ret;
7256 }
7257
7258 int btrfs_free_reserved_extent(struct btrfs_root *root,
7259                                u64 start, u64 len, int delalloc)
7260 {
7261         return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
7262 }
7263
7264 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
7265                                        u64 start, u64 len)
7266 {
7267         return __btrfs_free_reserved_extent(root, start, len, 1, 0);
7268 }
7269
7270 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7271                                       struct btrfs_root *root,
7272                                       u64 parent, u64 root_objectid,
7273                                       u64 flags, u64 owner, u64 offset,
7274                                       struct btrfs_key *ins, int ref_mod)
7275 {
7276         int ret;
7277         struct btrfs_fs_info *fs_info = root->fs_info;
7278         struct btrfs_extent_item *extent_item;
7279         struct btrfs_extent_inline_ref *iref;
7280         struct btrfs_path *path;
7281         struct extent_buffer *leaf;
7282         int type;
7283         u32 size;
7284
7285         if (parent > 0)
7286                 type = BTRFS_SHARED_DATA_REF_KEY;
7287         else
7288                 type = BTRFS_EXTENT_DATA_REF_KEY;
7289
7290         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7291
7292         path = btrfs_alloc_path();
7293         if (!path)
7294                 return -ENOMEM;
7295
7296         path->leave_spinning = 1;
7297         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7298                                       ins, size);
7299         if (ret) {
7300                 btrfs_free_path(path);
7301                 return ret;
7302         }
7303
7304         leaf = path->nodes[0];
7305         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7306                                      struct btrfs_extent_item);
7307         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7308         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7309         btrfs_set_extent_flags(leaf, extent_item,
7310                                flags | BTRFS_EXTENT_FLAG_DATA);
7311
7312         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7313         btrfs_set_extent_inline_ref_type(leaf, iref, type);
7314         if (parent > 0) {
7315                 struct btrfs_shared_data_ref *ref;
7316                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7317                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7318                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7319         } else {
7320                 struct btrfs_extent_data_ref *ref;
7321                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7322                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7323                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7324                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7325                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7326         }
7327
7328         btrfs_mark_buffer_dirty(path->nodes[0]);
7329         btrfs_free_path(path);
7330
7331         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7332         if (ret) { /* -ENOENT, logic error */
7333                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7334                         ins->objectid, ins->offset);
7335                 BUG();
7336         }
7337         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
7338         return ret;
7339 }
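
/*
 * Resulting item layout (illustrative): for an unshared data extent the
 * new item is
 *
 *         btrfs_extent_item | inline ref (BTRFS_EXTENT_DATA_REF_KEY) |
 *         btrfs_extent_data_ref { root, objectid, offset, count }
 *
 * while a shared extent (parent > 0) stores the parent bytenr in the
 * inline ref offset followed by a btrfs_shared_data_ref count.
 */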
7340
7341 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7342                                      struct btrfs_root *root,
7343                                      u64 parent, u64 root_objectid,
7344                                      u64 flags, struct btrfs_disk_key *key,
7345                                      int level, struct btrfs_key *ins,
7346                                      int no_quota)
7347 {
7348         int ret;
7349         struct btrfs_fs_info *fs_info = root->fs_info;
7350         struct btrfs_extent_item *extent_item;
7351         struct btrfs_tree_block_info *block_info;
7352         struct btrfs_extent_inline_ref *iref;
7353         struct btrfs_path *path;
7354         struct extent_buffer *leaf;
7355         u32 size = sizeof(*extent_item) + sizeof(*iref);
7356         u64 num_bytes = ins->offset;
7357         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7358                                                  SKINNY_METADATA);
7359
7360         if (!skinny_metadata)
7361                 size += sizeof(*block_info);
7362
7363         path = btrfs_alloc_path();
7364         if (!path) {
7365                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7366                                                    root->nodesize);
7367                 return -ENOMEM;
7368         }
7369
7370         path->leave_spinning = 1;
7371         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7372                                       ins, size);
7373         if (ret) {
7374                 btrfs_free_path(path);
7375                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7376                                                    root->nodesize);
7377                 return ret;
7378         }
7379
7380         leaf = path->nodes[0];
7381         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7382                                      struct btrfs_extent_item);
7383         btrfs_set_extent_refs(leaf, extent_item, 1);
7384         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7385         btrfs_set_extent_flags(leaf, extent_item,
7386                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7387
7388         if (skinny_metadata) {
7389                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7390                 num_bytes = root->nodesize;
7391         } else {
7392                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7393                 btrfs_set_tree_block_key(leaf, block_info, key);
7394                 btrfs_set_tree_block_level(leaf, block_info, level);
7395                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7396         }
7397
7398         if (parent > 0) {
7399                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7400                 btrfs_set_extent_inline_ref_type(leaf, iref,
7401                                                  BTRFS_SHARED_BLOCK_REF_KEY);
7402                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7403         } else {
7404                 btrfs_set_extent_inline_ref_type(leaf, iref,
7405                                                  BTRFS_TREE_BLOCK_REF_KEY);
7406                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
7407         }
7408
7409         btrfs_mark_buffer_dirty(leaf);
7410         btrfs_free_path(path);
7411
7412         ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7413                                  1);
7414         if (ret) { /* -ENOENT, logic error */
7415                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7416                         ins->objectid, ins->offset);
7417                 BUG();
7418         }
7419
7420         trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7421         return ret;
7422 }
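
/*
 * Layout note (illustrative): with the SKINNY_METADATA incompat flag the
 * btrfs_tree_block_info is omitted and the inline ref follows the extent
 * item directly; otherwise the key and level are stored in the
 * tree_block_info between the two, which is why 'size' differs above.
 */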
7423
7424 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7425                                      struct btrfs_root *root,
7426                                      u64 root_objectid, u64 owner,
7427                                      u64 offset, struct btrfs_key *ins)
7428 {
7429         int ret;
7430
7431         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
7432
7433         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7434                                          ins->offset, 0,
7435                                          root_objectid, owner, offset,
7436                                          BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
7437         return ret;
7438 }
7439
7440 /*
7441  * this is used by the tree logging recovery code.  It records that
7442  * an extent has been allocated and makes sure to clear the free
7443  * space cache bits as well
7444  */
7445 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7446                                    struct btrfs_root *root,
7447                                    u64 root_objectid, u64 owner, u64 offset,
7448                                    struct btrfs_key *ins)
7449 {
7450         int ret;
7451         struct btrfs_block_group_cache *block_group;
7452
7453         /*
7454          * Mixed block groups will exclude before processing the log, so we only
7455          * need to do the exclude dance if this fs isn't mixed.
7456          */
7457         if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
7458                 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
7459                 if (ret)
7460                         return ret;
7461         }
7462
7463         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
7464         if (!block_group)
7465                 return -EINVAL;
7466
7467         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7468                                           RESERVE_ALLOC_NO_ACCOUNT, 0);
7469         BUG_ON(ret); /* logic error */
7470         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7471                                          0, owner, offset, ins, 1);
7472         btrfs_put_block_group(block_group);
7473         return ret;
7474 }
7475
7476 static struct extent_buffer *
7477 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7478                       u64 bytenr, int level)
7479 {
7480         struct extent_buffer *buf;
7481
7482         buf = btrfs_find_create_tree_block(root, bytenr);
7483         if (!buf)
7484                 return ERR_PTR(-ENOMEM);
7485         btrfs_set_header_generation(buf, trans->transid);
7486         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7487         btrfs_tree_lock(buf);
7488         clean_tree_block(trans, root->fs_info, buf);
7489         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7490
7491         btrfs_set_lock_blocking(buf);
7492         btrfs_set_buffer_uptodate(buf);
7493
7494         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7495                 buf->log_index = root->log_transid % 2;
7496                 /*
7497                  * we allow two log transactions at a time, use different
7498                  * EXTENT bits to differentiate dirty pages.
7499                  */
7500                 if (buf->log_index == 0)
7501                         set_extent_dirty(&root->dirty_log_pages, buf->start,
7502                                         buf->start + buf->len - 1, GFP_NOFS);
7503                 else
7504                         set_extent_new(&root->dirty_log_pages, buf->start,
7505                                         buf->start + buf->len - 1, GFP_NOFS);
7506         } else {
7507                 buf->log_index = -1;
7508                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7509                          buf->start + buf->len - 1, GFP_NOFS);
7510         }
7511         trans->blocks_used++;
7512         /* this returns a buffer locked for blocking */
7513         return buf;
7514 }
7515
7516 static struct btrfs_block_rsv *
7517 use_block_rsv(struct btrfs_trans_handle *trans,
7518               struct btrfs_root *root, u32 blocksize)
7519 {
7520         struct btrfs_block_rsv *block_rsv;
7521         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
7522         int ret;
7523         bool global_updated = false;
7524
7525         block_rsv = get_block_rsv(trans, root);
7526
7527         if (unlikely(block_rsv->size == 0))
7528                 goto try_reserve;
7529 again:
7530         ret = block_rsv_use_bytes(block_rsv, blocksize);
7531         if (!ret)
7532                 return block_rsv;
7533
7534         if (block_rsv->failfast)
7535                 return ERR_PTR(ret);
7536
7537         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7538                 global_updated = true;
7539                 update_global_block_rsv(root->fs_info);
7540                 goto again;
7541         }
7542
7543         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7544                 static DEFINE_RATELIMIT_STATE(_rs,
7545                                 DEFAULT_RATELIMIT_INTERVAL * 10,
7546                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
7547                 if (__ratelimit(&_rs))
7548                         WARN(1, KERN_DEBUG
7549                                 "BTRFS: block rsv returned %d\n", ret);
7550         }
7551 try_reserve:
7552         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
7553                                      BTRFS_RESERVE_NO_FLUSH);
7554         if (!ret)
7555                 return block_rsv;
7556         /*
7557          * If we couldn't reserve metadata bytes, try to use some from
7558          * the global reserve if its space info is the same as the
7559          * global reserve's.
7560          */
7561         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7562             block_rsv->space_info == global_rsv->space_info) {
7563                 ret = block_rsv_use_bytes(global_rsv, blocksize);
7564                 if (!ret)
7565                         return global_rsv;
7566         }
7567         return ERR_PTR(ret);
7568 }
7569
7570 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7571                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
7572 {
7573         block_rsv_add_bytes(block_rsv, blocksize, 0);
7574         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
7575 }
7576
7577 /*
7578  * finds a free extent and does all the dirty work required for allocation.
7579  * It reserves the extent and sets up a tree buffer for the first block
7580  * of the extent.
7581  *
7582  * returns the tree buffer or an ERR_PTR on error.
7583  */
7584 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7585                                         struct btrfs_root *root,
7586                                         u64 parent, u64 root_objectid,
7587                                         struct btrfs_disk_key *key, int level,
7588                                         u64 hint, u64 empty_size)
7589 {
7590         struct btrfs_key ins;
7591         struct btrfs_block_rsv *block_rsv;
7592         struct extent_buffer *buf;
7593         struct btrfs_delayed_extent_op *extent_op;
7594         u64 flags = 0;
7595         int ret;
7596         u32 blocksize = root->nodesize;
7597         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7598                                                  SKINNY_METADATA);
7599
7600         if (btrfs_test_is_dummy_root(root)) {
7601                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7602                                             level);
7603                 if (!IS_ERR(buf))
7604                         root->alloc_bytenr += blocksize;
7605                 return buf;
7606         }
7607
7608         block_rsv = use_block_rsv(trans, root, blocksize);
7609         if (IS_ERR(block_rsv))
7610                 return ERR_CAST(block_rsv);
7611
7612         ret = btrfs_reserve_extent(root, blocksize, blocksize,
7613                                    empty_size, hint, &ins, 0, 0);
7614         if (ret)
7615                 goto out_unuse;
7616
7617         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7618         if (IS_ERR(buf)) {
7619                 ret = PTR_ERR(buf);
7620                 goto out_free_reserved;
7621         }
7622
7623         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7624                 if (parent == 0)
7625                         parent = ins.objectid;
7626                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7627         } else
7628                 BUG_ON(parent > 0);
7629
7630         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7631                 extent_op = btrfs_alloc_delayed_extent_op();
7632                 if (!extent_op) {
7633                         ret = -ENOMEM;
7634                         goto out_free_buf;
7635                 }
7636                 if (key)
7637                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
7638                 else
7639                         memset(&extent_op->key, 0, sizeof(extent_op->key));
7640                 extent_op->flags_to_set = flags;
7641                 if (skinny_metadata)
7642                         extent_op->update_key = 0;
7643                 else
7644                         extent_op->update_key = 1;
7645                 extent_op->update_flags = 1;
7646                 extent_op->is_data = 0;
7647                 extent_op->level = level;
7648
7649                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7650                                                  ins.objectid, ins.offset,
7651                                                  parent, root_objectid, level,
7652                                                  BTRFS_ADD_DELAYED_EXTENT,
7653                                                  extent_op, 0);
7654                 if (ret)
7655                         goto out_free_delayed;
7656         }
7657         return buf;
7658
7659 out_free_delayed:
7660         btrfs_free_delayed_extent_op(extent_op);
7661 out_free_buf:
7662         free_extent_buffer(buf);
7663 out_free_reserved:
7664         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
7665 out_unuse:
7666         unuse_block_rsv(root->fs_info, block_rsv, blocksize);
7667         return ERR_PTR(ret);
7668 }
7669
7670 struct walk_control {
7671         u64 refs[BTRFS_MAX_LEVEL];
7672         u64 flags[BTRFS_MAX_LEVEL];
7673         struct btrfs_key update_progress;
7674         int stage;
7675         int level;
7676         int shared_level;
7677         int update_ref;
7678         int keep_locks;
7679         int reada_slot;
7680         int reada_count;
7681         int for_reloc;
7682 };
7683
7684 #define DROP_REFERENCE  1
7685 #define UPDATE_BACKREF  2
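
/*
 * walk_control stages (descriptive summary): in DROP_REFERENCE we walk
 * down dropping our reference on each block and only descend into blocks
 * whose reference count is 1; in UPDATE_BACKREF we revisit blocks to
 * update their back references (see walk_down_proc() below).
 */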
7686
7687 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7688                                      struct btrfs_root *root,
7689                                      struct walk_control *wc,
7690                                      struct btrfs_path *path)
7691 {
7692         u64 bytenr;
7693         u64 generation;
7694         u64 refs;
7695         u64 flags;
7696         u32 nritems;
7697         u32 blocksize;
7698         struct btrfs_key key;
7699         struct extent_buffer *eb;
7700         int ret;
7701         int slot;
7702         int nread = 0;
7703
7704         if (path->slots[wc->level] < wc->reada_slot) {
7705                 wc->reada_count = wc->reada_count * 2 / 3;
7706                 wc->reada_count = max(wc->reada_count, 2);
7707         } else {
7708                 wc->reada_count = wc->reada_count * 3 / 2;
7709                 wc->reada_count = min_t(int, wc->reada_count,
7710                                         BTRFS_NODEPTRS_PER_BLOCK(root));
7711         }
7712
7713         eb = path->nodes[wc->level];
7714         nritems = btrfs_header_nritems(eb);
7715         blocksize = root->nodesize;
7716
7717         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7718                 if (nread >= wc->reada_count)
7719                         break;
7720
7721                 cond_resched();
7722                 bytenr = btrfs_node_blockptr(eb, slot);
7723                 generation = btrfs_node_ptr_generation(eb, slot);
7724
7725                 if (slot == path->slots[wc->level])
7726                         goto reada;
7727
7728                 if (wc->stage == UPDATE_BACKREF &&
7729                     generation <= root->root_key.offset)
7730                         continue;
7731
7732                 /* We don't lock the tree block, it's OK to be racy here */
7733                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
7734                                                wc->level - 1, 1, &refs,
7735                                                &flags);
7736                 /* We don't care about errors in readahead. */
7737                 if (ret < 0)
7738                         continue;
7739                 BUG_ON(refs == 0);
7740
7741                 if (wc->stage == DROP_REFERENCE) {
7742                         if (refs == 1)
7743                                 goto reada;
7744
7745                         if (wc->level == 1 &&
7746                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7747                                 continue;
7748                         if (!wc->update_ref ||
7749                             generation <= root->root_key.offset)
7750                                 continue;
7751                         btrfs_node_key_to_cpu(eb, &key, slot);
7752                         ret = btrfs_comp_cpu_keys(&key,
7753                                                   &wc->update_progress);
7754                         if (ret < 0)
7755                                 continue;
7756                 } else {
7757                         if (wc->level == 1 &&
7758                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7759                                 continue;
7760                 }
7761 reada:
7762                 readahead_tree_block(root, bytenr);
7763                 nread++;
7764         }
7765         wc->reada_slot = slot;
7766 }
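
/*
 * Readahead window heuristic (illustrative): reada_count shrinks to 2/3
 * (but at least 2) when we re-enter at a slot before the previous
 * reada_slot, and otherwise grows by 3/2 up to
 * BTRFS_NODEPTRS_PER_BLOCK(root), e.g. 8 -> 12 -> 18 while the walk
 * keeps moving forward.
 */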
7767
7768 /*
7769  * TODO: Modify related function to add related node/leaf to dirty_extent_root,
7770  * for later qgroup accounting.
7771  *
7772  * Currently, this function does nothing.
7773  */
7774 static int account_leaf_items(struct btrfs_trans_handle *trans,
7775                               struct btrfs_root *root,
7776                               struct extent_buffer *eb)
7777 {
7778         int nr = btrfs_header_nritems(eb);
7779         int i, extent_type;
7780         struct btrfs_key key;
7781         struct btrfs_file_extent_item *fi;
7782         u64 bytenr, num_bytes;
7783
7784         for (i = 0; i < nr; i++) {
7785                 btrfs_item_key_to_cpu(eb, &key, i);
7786
7787                 if (key.type != BTRFS_EXTENT_DATA_KEY)
7788                         continue;
7789
7790                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
7791                 /* filter out non qgroup-accountable extents  */
7792                 extent_type = btrfs_file_extent_type(eb, fi);
7793
7794                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
7795                         continue;
7796
7797                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
7798                 if (!bytenr)
7799                         continue;
7800
7801                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
7802         }
7803         return 0;
7804 }
7805
7806 /*
7807  * Walk up the tree from the bottom, freeing leaves and any interior
7808  * nodes which have had all slots visited. If a node (leaf or
7809  * interior) is freed, the node above it will have its slot
7810  * incremented. The root node will never be freed.
7811  *
7812  * At the end of this function, we should have a path which has all
7813  * slots incremented to the next position for a search. If we need to
7814  * read a new node it will be NULL and the node above it will have the
7815  * correct slot selected for a later read.
7816  *
7817  * If we increment the root nodes slot counter past the number of
7818  * elements, 1 is returned to signal completion of the search.
7819  */
7820 static int adjust_slots_upwards(struct btrfs_root *root,
7821                                 struct btrfs_path *path, int root_level)
7822 {
7823         int level = 0;
7824         int nr, slot;
7825         struct extent_buffer *eb;
7826
7827         if (root_level == 0)
7828                 return 1;
7829
7830         while (level <= root_level) {
7831                 eb = path->nodes[level];
7832                 nr = btrfs_header_nritems(eb);
7833                 path->slots[level]++;
7834                 slot = path->slots[level];
7835                 if (slot >= nr || level == 0) {
7836                         /*
7837                          * Don't free the root - we will detect this
7838                          * condition after our loop and return a positive
7839                          * value for the caller to stop walking the tree.
7840                          */
7841                         if (level != root_level) {
7842                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
7843                                 path->locks[level] = 0;
7844
7845                                 free_extent_buffer(eb);
7846                                 path->nodes[level] = NULL;
7847                                 path->slots[level] = 0;
7848                         }
7849                 } else {
7850                         /*
7851                          * We have a valid slot to walk back down
7852                          * from. Stop here so caller can process these
7853                          * new nodes.
7854                          */
7855                         break;
7856                 }
7857
7858                 level++;
7859         }
7860
7861         eb = path->nodes[root_level];
7862         if (path->slots[root_level] >= btrfs_header_nritems(eb))
7863                 return 1;
7864
7865         return 0;
7866 }
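
/*
 * Illustrative caller: account_shared_subtree() below accounts a leaf,
 * calls adjust_slots_upwards() and either restarts its walk_down loop
 * with the adjusted slots or, on a nonzero return, ends the search.
 */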
7867
7868 /*
7869  * root_eb is the subtree root and is locked before this function is called.
7870  * TODO: Modify this function to mark all (including complete shared node)
7871  * to dirty_extent_root to allow it get accounted in qgroup.
7872  */
7873 static int account_shared_subtree(struct btrfs_trans_handle *trans,
7874                                   struct btrfs_root *root,
7875                                   struct extent_buffer *root_eb,
7876                                   u64 root_gen,
7877                                   int root_level)
7878 {
7879         int ret = 0;
7880         int level;
7881         struct extent_buffer *eb = root_eb;
7882         struct btrfs_path *path = NULL;
7883
7884         BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
7885         BUG_ON(root_eb == NULL);
7886
7887         if (!root->fs_info->quota_enabled)
7888                 return 0;
7889
7890         if (!extent_buffer_uptodate(root_eb)) {
7891                 ret = btrfs_read_buffer(root_eb, root_gen);
7892                 if (ret)
7893                         goto out;
7894         }
7895
7896         if (root_level == 0) {
7897                 ret = account_leaf_items(trans, root, root_eb);
7898                 goto out;
7899         }
7900
7901         path = btrfs_alloc_path();
7902         if (!path)
7903                 return -ENOMEM;
7904
7905         /*
7906          * Walk down the tree.  Missing extent blocks are filled in as
7907          * we go. Metadata is accounted every time we read a new
7908          * extent block.
7909          *
7910          * When we reach a leaf, we account for file extent items in it,
7911          * walk back up the tree (adjusting slot pointers as we go)
7912          * and restart the search process.
7913          */
7914         extent_buffer_get(root_eb); /* For path */
7915         path->nodes[root_level] = root_eb;
7916         path->slots[root_level] = 0;
7917         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
7918 walk_down:
7919         level = root_level;
7920         while (level >= 0) {
7921                 if (path->nodes[level] == NULL) {
7922                         int parent_slot;
7923                         u64 child_gen;
7924                         u64 child_bytenr;
7925
7926                         /* We need to get child blockptr/gen from
7927                          * parent before we can read it. */
7928                         eb = path->nodes[level + 1];
7929                         parent_slot = path->slots[level + 1];
7930                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7931                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7932
7933                         eb = read_tree_block(root, child_bytenr, child_gen);
7934                         if (IS_ERR(eb)) {
7935                                 ret = PTR_ERR(eb);
7936                                 goto out;
7937                         } else if (!extent_buffer_uptodate(eb)) {
7938                                 free_extent_buffer(eb);
7939                                 ret = -EIO;
7940                                 goto out;
7941                         }
7942
7943                         path->nodes[level] = eb;
7944                         path->slots[level] = 0;
7945
7946                         btrfs_tree_read_lock(eb);
7947                         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
7948                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
7949                 }
7950
7951                 if (level == 0) {
7952                         ret = account_leaf_items(trans, root, path->nodes[level]);
7953                         if (ret)
7954                                 goto out;
7955
7956                         /* Nonzero return here means we completed our search */
7957                         ret = adjust_slots_upwards(root, path, root_level);
7958                         if (ret)
7959                                 break;
7960
7961                         /* Restart search with new slots */
7962                         goto walk_down;
7963                 }
7964
7965                 level--;
7966         }
7967
7968         ret = 0;
7969 out:
7970         btrfs_free_path(path);
7971
7972         return ret;
7973 }
7974
7975 /*
7976  * helper to process tree block while walking down the tree.
7977  *
7978  * when wc->stage == UPDATE_BACKREF, this function updates
7979  * back refs for pointers in the block.
7980  *
7981  * NOTE: return value 1 means we should stop walking down.
7982  */
7983 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7984                                    struct btrfs_root *root,
7985                                    struct btrfs_path *path,
7986                                    struct walk_control *wc, int lookup_info)
7987 {
7988         int level = wc->level;
7989         struct extent_buffer *eb = path->nodes[level];
7990         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7991         int ret;
7992
7993         if (wc->stage == UPDATE_BACKREF &&
7994             btrfs_header_owner(eb) != root->root_key.objectid)
7995                 return 1;
7996
7997         /*
7998          * when the reference count of a tree block is 1, it won't increase
7999          * again. Once the full backref flag is set, we never clear it.
8000          */
8001         if (lookup_info &&
8002             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8003              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8004                 BUG_ON(!path->locks[level]);
8005                 ret = btrfs_lookup_extent_info(trans, root,
8006                                                eb->start, level, 1,
8007                                                &wc->refs[level],
8008                                                &wc->flags[level]);
8009                 BUG_ON(ret == -ENOMEM);
8010                 if (ret)
8011                         return ret;
8012                 BUG_ON(wc->refs[level] == 0);
8013         }
8014
8015         if (wc->stage == DROP_REFERENCE) {
8016                 if (wc->refs[level] > 1)
8017                         return 1;
8018
8019                 if (path->locks[level] && !wc->keep_locks) {
8020                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8021                         path->locks[level] = 0;
8022                 }
8023                 return 0;
8024         }
8025
8026         /* wc->stage == UPDATE_BACKREF */
8027         if (!(wc->flags[level] & flag)) {
8028                 BUG_ON(!path->locks[level]);
8029                 ret = btrfs_inc_ref(trans, root, eb, 1);
8030                 BUG_ON(ret); /* -ENOMEM */
8031                 ret = btrfs_dec_ref(trans, root, eb, 0);
8032                 BUG_ON(ret); /* -ENOMEM */
8033                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8034                                                   eb->len, flag,
8035                                                   btrfs_header_level(eb), 0);
8036                 BUG_ON(ret); /* -ENOMEM */
8037                 wc->flags[level] |= flag;
8038         }
8039
8040         /*
8041          * the block is shared by multiple trees, so it's not good to
8042          * keep the tree lock
8043          */
8044         if (path->locks[level] && level > 0) {
8045                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8046                 path->locks[level] = 0;
8047         }
8048         return 0;
8049 }
8050
8051 /*
8052  * helper to process tree block pointer.
8053  *
8054  * when wc->stage == DROP_REFERENCE, this function checks the
8055  * reference count of the block pointed to. If the block
8056  * is shared and we need to update back refs for the subtree
8057  * rooted at the block, this function changes wc->stage to
8058  * UPDATE_BACKREF. If the block is shared and there is no
8059  * need to update back refs, this function drops the reference
8060  * to the block.
8061  *
8062  * NOTE: return value 1 means we should stop walking down.
8063  */
8064 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8065                                  struct btrfs_root *root,
8066                                  struct btrfs_path *path,
8067                                  struct walk_control *wc, int *lookup_info)
8068 {
8069         u64 bytenr;
8070         u64 generation;
8071         u64 parent;
8072         u32 blocksize;
8073         struct btrfs_key key;
8074         struct extent_buffer *next;
8075         int level = wc->level;
8076         int reada = 0;
8077         int ret = 0;
8078         bool need_account = false;
8079
8080         generation = btrfs_node_ptr_generation(path->nodes[level],
8081                                                path->slots[level]);
8082         /*
8083          * if the lower level block was created before the snapshot
8084          * was created, we know there is no need to update back refs
8085          * for the subtree
8086          */
8087         if (wc->stage == UPDATE_BACKREF &&
8088             generation <= root->root_key.offset) {
8089                 *lookup_info = 1;
8090                 return 1;
8091         }
8092
8093         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8094         blocksize = root->nodesize;
8095
8096         next = btrfs_find_tree_block(root->fs_info, bytenr);
8097         if (!next) {
8098                 next = btrfs_find_create_tree_block(root, bytenr);
8099                 if (!next)
8100                         return -ENOMEM;
8101                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8102                                                level - 1);
8103                 reada = 1;
8104         }
8105         btrfs_tree_lock(next);
8106         btrfs_set_lock_blocking(next);
8107
8108         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8109                                        &wc->refs[level - 1],
8110                                        &wc->flags[level - 1]);
8111         if (ret < 0) {
8112                 btrfs_tree_unlock(next);
8113                 return ret;
8114         }
8115
8116         if (unlikely(wc->refs[level - 1] == 0)) {
8117                 btrfs_err(root->fs_info, "Missing references.");
8118                 BUG();
8119         }
8120         *lookup_info = 0;
8121
8122         if (wc->stage == DROP_REFERENCE) {
8123                 if (wc->refs[level - 1] > 1) {
8124                         need_account = true;
8125                         if (level == 1 &&
8126                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8127                                 goto skip;
8128
8129                         if (!wc->update_ref ||
8130                             generation <= root->root_key.offset)
8131                                 goto skip;
8132
8133                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8134                                               path->slots[level]);
8135                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8136                         if (ret < 0)
8137                                 goto skip;
8138
8139                         wc->stage = UPDATE_BACKREF;
8140                         wc->shared_level = level - 1;
8141                 }
8142         } else {
8143                 if (level == 1 &&
8144                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8145                         goto skip;
8146         }
8147
8148         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8149                 btrfs_tree_unlock(next);
8150                 free_extent_buffer(next);
8151                 next = NULL;
8152                 *lookup_info = 1;
8153         }
8154
8155         if (!next) {
8156                 if (reada && level == 1)
8157                         reada_walk_down(trans, root, wc, path);
8158                 next = read_tree_block(root, bytenr, generation);
8159                 if (IS_ERR(next)) {
8160                         return PTR_ERR(next);
8161                 } else if (!extent_buffer_uptodate(next)) {
8162                         free_extent_buffer(next);
8163                         return -EIO;
8164                 }
8165                 btrfs_tree_lock(next);
8166                 btrfs_set_lock_blocking(next);
8167         }
8168
8169         level--;
8170         BUG_ON(level != btrfs_header_level(next));
8171         path->nodes[level] = next;
8172         path->slots[level] = 0;
8173         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8174         wc->level = level;
8175         if (wc->level == 1)
8176                 wc->reada_slot = 0;
8177         return 0;
8178 skip:
8179         wc->refs[level - 1] = 0;
8180         wc->flags[level - 1] = 0;
8181         if (wc->stage == DROP_REFERENCE) {
8182                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8183                         parent = path->nodes[level]->start;
8184                 } else {
8185                         BUG_ON(root->root_key.objectid !=
8186                                btrfs_header_owner(path->nodes[level]));
8187                         parent = 0;
8188                 }
8189
8190                 if (need_account) {
8191                         ret = account_shared_subtree(trans, root, next,
8192                                                      generation, level - 1);
8193                         if (ret) {
8194                                 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8195                                         "%d accounting shared subtree. Quota "
8196                                         "is out of sync, rescan required.\n",
8197                                         root->fs_info->sb->s_id, ret);
8198                         }
8199                 }
8200                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8201                                 root->root_key.objectid, level - 1, 0, 0);
8202                 BUG_ON(ret); /* -ENOMEM */
8203         }
8204         btrfs_tree_unlock(next);
8205         free_extent_buffer(next);
8206         *lookup_info = 1;
8207         return 1;
8208 }
8209
8210 /*
8211  * helper to process tree block while walking up the tree.
8212  *
8213  * when wc->stage == DROP_REFERENCE, this function drops
8214  * reference count on the block.
8215  *
8216  * when wc->stage == UPDATE_BACKREF, this function changes
8217  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8218  * to UPDATE_BACKREF previously while processing the block.
8219  *
8220  * NOTE: return value 1 means we should stop walking up.
8221  */
8222 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8223                                  struct btrfs_root *root,
8224                                  struct btrfs_path *path,
8225                                  struct walk_control *wc)
8226 {
8227         int ret;
8228         int level = wc->level;
8229         struct extent_buffer *eb = path->nodes[level];
8230         u64 parent = 0;
8231
8232         if (wc->stage == UPDATE_BACKREF) {
8233                 BUG_ON(wc->shared_level < level);
8234                 if (level < wc->shared_level)
8235                         goto out;
8236
8237                 ret = find_next_key(path, level + 1, &wc->update_progress);
8238                 if (ret > 0)
8239                         wc->update_ref = 0;
8240
8241                 wc->stage = DROP_REFERENCE;
8242                 wc->shared_level = -1;
8243                 path->slots[level] = 0;
8244
8245                 /*
8246                  * check reference count again if the block isn't locked.
8247                  * we should start walking down the tree again if reference
8248                  * count is one.
8249                  */
8250                 if (!path->locks[level]) {
8251                         BUG_ON(level == 0);
8252                         btrfs_tree_lock(eb);
8253                         btrfs_set_lock_blocking(eb);
8254                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8255
8256                         ret = btrfs_lookup_extent_info(trans, root,
8257                                                        eb->start, level, 1,
8258                                                        &wc->refs[level],
8259                                                        &wc->flags[level]);
8260                         if (ret < 0) {
8261                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8262                                 path->locks[level] = 0;
8263                                 return ret;
8264                         }
8265                         BUG_ON(wc->refs[level] == 0);
8266                         if (wc->refs[level] == 1) {
8267                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8268                                 path->locks[level] = 0;
8269                                 return 1;
8270                         }
8271                 }
8272         }
8273
8274         /* wc->stage == DROP_REFERENCE */
8275         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8276
8277         if (wc->refs[level] == 1) {
8278                 if (level == 0) {
8279                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8280                                 ret = btrfs_dec_ref(trans, root, eb, 1);
8281                         else
8282                                 ret = btrfs_dec_ref(trans, root, eb, 0);
8283                         BUG_ON(ret); /* -ENOMEM */
8284                         ret = account_leaf_items(trans, root, eb);
8285                         if (ret) {
8286                                 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8287                                         "%d accounting leaf items. Quota "
8288                                         "is out of sync, rescan required.\n",
8289                                         root->fs_info->sb->s_id, ret);
8290                         }
8291                 }
8292                 /* make block locked assertion in clean_tree_block happy */
8293                 if (!path->locks[level] &&
8294                     btrfs_header_generation(eb) == trans->transid) {
8295                         btrfs_tree_lock(eb);
8296                         btrfs_set_lock_blocking(eb);
8297                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8298                 }
8299                 clean_tree_block(trans, root->fs_info, eb);
8300         }
8301
8302         if (eb == root->node) {
8303                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8304                         parent = eb->start;
8305                 else
8306                         BUG_ON(root->root_key.objectid !=
8307                                btrfs_header_owner(eb));
8308         } else {
8309                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8310                         parent = path->nodes[level + 1]->start;
8311                 else
8312                         BUG_ON(root->root_key.objectid !=
8313                                btrfs_header_owner(path->nodes[level + 1]));
8314         }
8315
8316         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8317 out:
8318         wc->refs[level] = 0;
8319         wc->flags[level] = 0;
8320         return 0;
8321 }
8322
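/*
 * Walk down the tree starting at wc->level.  walk_down_proc() decides
 * whether the block at the current level still needs to be descended
 * into; if so, do_walk_down() reads the child block, records its
 * refs/flags and pushes it onto the path.  A return value of 1 from
 * do_walk_down() means the child was skipped or freed, so we just move
 * on to the next slot.  We stop at a leaf or when the current node runs
 * out of slots.
 */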
8323 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8324                                    struct btrfs_root *root,
8325                                    struct btrfs_path *path,
8326                                    struct walk_control *wc)
8327 {
8328         int level = wc->level;
8329         int lookup_info = 1;
8330         int ret;
8331
8332         while (level >= 0) {
8333                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8334                 if (ret > 0)
8335                         break;
8336
8337                 if (level == 0)
8338                         break;
8339
8340                 if (path->slots[level] >=
8341                     btrfs_header_nritems(path->nodes[level]))
8342                         break;
8343
8344                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8345                 if (ret > 0) {
8346                         path->slots[level]++;
8347                         continue;
8348                 } else if (ret < 0)
8349                         return ret;
8350                 level = wc->level;
8351         }
8352         return 0;
8353 }
8354
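/*
 * Walk back up the tree from wc->level towards max_level.  If the node
 * at the current level still has slots left, advance to the next slot
 * and return 0 so the caller can walk down again.  Otherwise finish the
 * block with walk_up_proc(), drop our lock and reference on it, and move
 * up one level.  Returns 1 once every level below max_level is done.
 */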
8355 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8356                                  struct btrfs_root *root,
8357                                  struct btrfs_path *path,
8358                                  struct walk_control *wc, int max_level)
8359 {
8360         int level = wc->level;
8361         int ret;
8362
8363         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8364         while (level < max_level && path->nodes[level]) {
8365                 wc->level = level;
8366                 if (path->slots[level] + 1 <
8367                     btrfs_header_nritems(path->nodes[level])) {
8368                         path->slots[level]++;
8369                         return 0;
8370                 } else {
8371                         ret = walk_up_proc(trans, root, path, wc);
8372                         if (ret > 0)
8373                                 return 0;
8374
8375                         if (path->locks[level]) {
8376                                 btrfs_tree_unlock_rw(path->nodes[level],
8377                                                      path->locks[level]);
8378                                 path->locks[level] = 0;
8379                         }
8380                         free_extent_buffer(path->nodes[level]);
8381                         path->nodes[level] = NULL;
8382                         level++;
8383                 }
8384         }
8385         return 1;
8386 }
8387
8388 /*
8389  * drop a subvolume tree.
8390  *
8391  * this function traverses the tree freeing any blocks that are only
8392  * referenced by the tree.
8393  *
8394  * when a shared tree block is found, this function decreases its
8395  * reference count by one. If update_ref is true, this function
8396  * also makes sure backrefs for the shared block and all lower level
8397  * blocks are properly updated.
8398  *
8399  * If called with for_reloc == 0, may exit early with -EAGAIN
8400  */
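/*
 * Rough usage sketch (the call sites live outside this file): the
 * cleaner thread drops dead subvolumes with something like
 *
 *	btrfs_drop_snapshot(root, NULL, update_ref, 0);
 *
 * while relocation passes for_reloc == 1 so the drop is never aborted
 * early.  This is only an illustration, not a list of callers.
 */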
8401 int btrfs_drop_snapshot(struct btrfs_root *root,
8402                          struct btrfs_block_rsv *block_rsv, int update_ref,
8403                          int for_reloc)
8404 {
8405         struct btrfs_path *path;
8406         struct btrfs_trans_handle *trans;
8407         struct btrfs_root *tree_root = root->fs_info->tree_root;
8408         struct btrfs_root_item *root_item = &root->root_item;
8409         struct walk_control *wc;
8410         struct btrfs_key key;
8411         int err = 0;
8412         int ret;
8413         int level;
8414         bool root_dropped = false;
8415
8416         btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8417
8418         path = btrfs_alloc_path();
8419         if (!path) {
8420                 err = -ENOMEM;
8421                 goto out;
8422         }
8423
8424         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8425         if (!wc) {
8426                 btrfs_free_path(path);
8427                 err = -ENOMEM;
8428                 goto out;
8429         }
8430
8431         trans = btrfs_start_transaction(tree_root, 0);
8432         if (IS_ERR(trans)) {
8433                 err = PTR_ERR(trans);
8434                 goto out_free;
8435         }
8436
8437         if (block_rsv)
8438                 trans->block_rsv = block_rsv;
8439
8440         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8441                 level = btrfs_header_level(root->node);
8442                 path->nodes[level] = btrfs_lock_root_node(root);
8443                 btrfs_set_lock_blocking(path->nodes[level]);
8444                 path->slots[level] = 0;
8445                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8446                 memset(&wc->update_progress, 0,
8447                        sizeof(wc->update_progress));
8448         } else {
8449                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8450                 memcpy(&wc->update_progress, &key,
8451                        sizeof(wc->update_progress));
8452
8453                 level = root_item->drop_level;
8454                 BUG_ON(level == 0);
8455                 path->lowest_level = level;
8456                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8457                 path->lowest_level = 0;
8458                 if (ret < 0) {
8459                         err = ret;
8460                         goto out_end_trans;
8461                 }
8462                 WARN_ON(ret > 0);
8463
8464                 /*
8465                  * unlock our path, this is safe because only this
8466                  * function is allowed to delete this snapshot
8467                  */
8468                 btrfs_unlock_up_safe(path, 0);
8469
8470                 level = btrfs_header_level(root->node);
8471                 while (1) {
8472                         btrfs_tree_lock(path->nodes[level]);
8473                         btrfs_set_lock_blocking(path->nodes[level]);
8474                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8475
8476                         ret = btrfs_lookup_extent_info(trans, root,
8477                                                 path->nodes[level]->start,
8478                                                 level, 1, &wc->refs[level],
8479                                                 &wc->flags[level]);
8480                         if (ret < 0) {
8481                                 err = ret;
8482                                 goto out_end_trans;
8483                         }
8484                         BUG_ON(wc->refs[level] == 0);
8485
8486                         if (level == root_item->drop_level)
8487                                 break;
8488
8489                         btrfs_tree_unlock(path->nodes[level]);
8490                         path->locks[level] = 0;
8491                         WARN_ON(wc->refs[level] != 1);
8492                         level--;
8493                 }
8494         }
8495
8496         wc->level = level;
8497         wc->shared_level = -1;
8498         wc->stage = DROP_REFERENCE;
8499         wc->update_ref = update_ref;
8500         wc->keep_locks = 0;
8501         wc->for_reloc = for_reloc;
8502         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8503
8504         while (1) {
8505
8506                 ret = walk_down_tree(trans, root, path, wc);
8507                 if (ret < 0) {
8508                         err = ret;
8509                         break;
8510                 }
8511
8512                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8513                 if (ret < 0) {
8514                         err = ret;
8515                         break;
8516                 }
8517
8518                 if (ret > 0) {
8519                         BUG_ON(wc->stage != DROP_REFERENCE);
8520                         break;
8521                 }
8522
8523                 if (wc->stage == DROP_REFERENCE) {
8524                         level = wc->level;
8525                         btrfs_node_key(path->nodes[level],
8526                                        &root_item->drop_progress,
8527                                        path->slots[level]);
8528                         root_item->drop_level = level;
8529                 }
8530
8531                 BUG_ON(wc->level == 0);
8532                 if (btrfs_should_end_transaction(trans, tree_root) ||
8533                     (!for_reloc && btrfs_need_cleaner_sleep(root))) {
8534                         ret = btrfs_update_root(trans, tree_root,
8535                                                 &root->root_key,
8536                                                 root_item);
8537                         if (ret) {
8538                                 btrfs_abort_transaction(trans, tree_root, ret);
8539                                 err = ret;
8540                                 goto out_end_trans;
8541                         }
8542
8543                         btrfs_end_transaction_throttle(trans, tree_root);
8544                         if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8545                                 pr_debug("BTRFS: drop snapshot early exit\n");
8546                                 err = -EAGAIN;
8547                                 goto out_free;
8548                         }
8549
8550                         trans = btrfs_start_transaction(tree_root, 0);
8551                         if (IS_ERR(trans)) {
8552                                 err = PTR_ERR(trans);
8553                                 goto out_free;
8554                         }
8555                         if (block_rsv)
8556                                 trans->block_rsv = block_rsv;
8557                 }
8558         }
8559         btrfs_release_path(path);
8560         if (err)
8561                 goto out_end_trans;
8562
8563         ret = btrfs_del_root(trans, tree_root, &root->root_key);
8564         if (ret) {
8565                 btrfs_abort_transaction(trans, tree_root, ret);
8566                 goto out_end_trans;
8567         }
8568
8569         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8570                 ret = btrfs_find_root(tree_root, &root->root_key, path,
8571                                       NULL, NULL);
8572                 if (ret < 0) {
8573                         btrfs_abort_transaction(trans, tree_root, ret);
8574                         err = ret;
8575                         goto out_end_trans;
8576                 } else if (ret > 0) {
8577                         /* if we fail to delete the orphan item this time
8578                          * around, it'll get picked up the next time.
8579                          *
8580                          * The most common failure here is just -ENOENT.
8581                          */
8582                         btrfs_del_orphan_item(trans, tree_root,
8583                                               root->root_key.objectid);
8584                 }
8585         }
8586
8587         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
8588                 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
8589         } else {
8590                 free_extent_buffer(root->node);
8591                 free_extent_buffer(root->commit_root);
8592                 btrfs_put_fs_root(root);
8593         }
8594         root_dropped = true;
8595 out_end_trans:
8596         btrfs_end_transaction_throttle(trans, tree_root);
8597 out_free:
8598         kfree(wc);
8599         btrfs_free_path(path);
8600 out:
8601         /*
8602          * If we need to stop dropping the snapshot for whatever reason, we
8603          * need to make sure to add it back to the dead root list so that we
8604          * keep trying to do the work later.  This also cleans up roots that
8605          * aren't in the radix tree (like when we recover after a power failure
8606          * or unmount), so we don't leak memory.
8607          */
8608         if (!for_reloc && root_dropped == false)
8609                 btrfs_add_dead_root(root);
8610         if (err && err != -EAGAIN)
8611                 btrfs_std_error(root->fs_info, err);
8612         return err;
8613 }
8614
8615 /*
8616  * drop subtree rooted at tree block 'node'.
8617  *
8618  * NOTE: this function will unlock and release tree block 'node'.
8619  * Only used by relocation code.
8620  */
8621 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
8622                         struct btrfs_root *root,
8623                         struct extent_buffer *node,
8624                         struct extent_buffer *parent)
8625 {
8626         struct btrfs_path *path;
8627         struct walk_control *wc;
8628         int level;
8629         int parent_level;
8630         int ret = 0;
8631         int wret;
8632
8633         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
8634
8635         path = btrfs_alloc_path();
8636         if (!path)
8637                 return -ENOMEM;
8638
8639         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8640         if (!wc) {
8641                 btrfs_free_path(path);
8642                 return -ENOMEM;
8643         }
8644
8645         btrfs_assert_tree_locked(parent);
8646         parent_level = btrfs_header_level(parent);
8647         extent_buffer_get(parent);
8648         path->nodes[parent_level] = parent;
8649         path->slots[parent_level] = btrfs_header_nritems(parent);
8650
8651         btrfs_assert_tree_locked(node);
8652         level = btrfs_header_level(node);
8653         path->nodes[level] = node;
8654         path->slots[level] = 0;
8655         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8656
8657         wc->refs[parent_level] = 1;
8658         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8659         wc->level = level;
8660         wc->shared_level = -1;
8661         wc->stage = DROP_REFERENCE;
8662         wc->update_ref = 0;
8663         wc->keep_locks = 1;
8664         wc->for_reloc = 1;
8665         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8666
8667         while (1) {
8668                 wret = walk_down_tree(trans, root, path, wc);
8669                 if (wret < 0) {
8670                         ret = wret;
8671                         break;
8672                 }
8673
8674                 wret = walk_up_tree(trans, root, path, wc, parent_level);
8675                 if (wret < 0)
8676                         ret = wret;
8677                 if (wret != 0)
8678                         break;
8679         }
8680
8681         kfree(wc);
8682         btrfs_free_path(path);
8683         return ret;
8684 }
8685
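/*
 * Work out which RAID profile a block group of the given type should end
 * up with after relocation, based on how many writable devices we have.
 * An explicit restripe target, if one is set, always wins.
 */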
8686 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
8687 {
8688         u64 num_devices;
8689         u64 stripped;
8690
8691         /*
8692          * if restripe for this chunk_type is on, pick the target profile and
8693          * return; otherwise do the usual balance
8694          */
8695         stripped = get_restripe_target(root->fs_info, flags);
8696         if (stripped)
8697                 return extended_to_chunk(stripped);
8698
8699         num_devices = root->fs_info->fs_devices->rw_devices;
8700
8701         stripped = BTRFS_BLOCK_GROUP_RAID0 |
8702                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
8703                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
8704
8705         if (num_devices == 1) {
8706                 stripped |= BTRFS_BLOCK_GROUP_DUP;
8707                 stripped = flags & ~stripped;
8708
8709                 /* turn raid0 into single device chunks */
8710                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
8711                         return stripped;
8712
8713                 /* turn mirroring into duplication */
8714                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
8715                              BTRFS_BLOCK_GROUP_RAID10))
8716                         return stripped | BTRFS_BLOCK_GROUP_DUP;
8717         } else {
8718                 /* they already had raid on here, just return */
8719                 if (flags & stripped)
8720                         return flags;
8721
8722                 stripped |= BTRFS_BLOCK_GROUP_DUP;
8723                 stripped = flags & ~stripped;
8724
8725                 /* switch duplicated blocks with raid1 */
8726                 if (flags & BTRFS_BLOCK_GROUP_DUP)
8727                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
8728
8729                 /* this is drive concat, leave it alone */
8730         }
8731
8732         return flags;
8733 }
8734
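/*
 * Try to mark a block group read-only.  Unless @force is set, metadata
 * and system space must keep a little allocatable room so chunk
 * allocation still works in corner cases.  On success the group's unused
 * bytes move into the space_info's readonly accounting and the group is
 * added to the ro_bgs list; otherwise -ENOSPC is returned.
 */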
8735 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8736 {
8737         struct btrfs_space_info *sinfo = cache->space_info;
8738         u64 num_bytes;
8739         u64 min_allocable_bytes;
8740         int ret = -ENOSPC;
8741
8742
8743         /*
8744          * We need some metadata space and system metadata space for
8745          * allocating chunks in some corner cases, so keep a minimum of
8746          * allocatable bytes unless we are forced to set the group readonly.
8747          */
8748         if ((sinfo->flags &
8749              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
8750             !force)
8751                 min_allocable_bytes = 1 * 1024 * 1024;
8752         else
8753                 min_allocable_bytes = 0;
8754
8755         spin_lock(&sinfo->lock);
8756         spin_lock(&cache->lock);
8757
8758         if (cache->ro) {
8759                 ret = 0;
8760                 goto out;
8761         }
8762
8763         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8764                     cache->bytes_super - btrfs_block_group_used(&cache->item);
8765
8766         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
8767             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
8768             min_allocable_bytes <= sinfo->total_bytes) {
8769                 sinfo->bytes_readonly += num_bytes;
8770                 cache->ro = 1;
8771                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
8772                 ret = 0;
8773         }
8774 out:
8775         spin_unlock(&cache->lock);
8776         spin_unlock(&sinfo->lock);
8777         return ret;
8778 }
8779
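/*
 * Mark a block group read-only.  This joins a transaction (waiting for
 * the current one to commit if dirty block group writeout has already
 * started), allocates a chunk at the target RAID level or in the group's
 * space_info if that is needed to make room, and for SYSTEM block groups
 * also makes sure there is room for a system chunk before returning.
 */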
8780 int btrfs_set_block_group_ro(struct btrfs_root *root,
8781                              struct btrfs_block_group_cache *cache)
8782
8783 {
8784         struct btrfs_trans_handle *trans;
8785         u64 alloc_flags;
8786         int ret;
8787
8788         BUG_ON(cache->ro);
8789
8790 again:
8791         trans = btrfs_join_transaction(root);
8792         if (IS_ERR(trans))
8793                 return PTR_ERR(trans);
8794
8795         /*
8796          * we're not allowed to set block groups readonly after the dirty
8797          * block groups cache has started writing.  If it already started,
8798          * back off and let this transaction commit
8799          */
8800         mutex_lock(&root->fs_info->ro_block_group_mutex);
8801         if (trans->transaction->dirty_bg_run) {
8802                 u64 transid = trans->transid;
8803
8804                 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8805                 btrfs_end_transaction(trans, root);
8806
8807                 ret = btrfs_wait_for_commit(root, transid);
8808                 if (ret)
8809                         return ret;
8810                 goto again;
8811         }
8812
8813         /*
8814          * if we are changing raid levels, try to allocate a corresponding
8815          * block group with the new raid level.
8816          */
8817         alloc_flags = update_block_group_flags(root, cache->flags);
8818         if (alloc_flags != cache->flags) {
8819                 ret = do_chunk_alloc(trans, root, alloc_flags,
8820                                      CHUNK_ALLOC_FORCE);
8821                 /*
8822                  * ENOSPC is allowed here, we may have enough space
8823                  * already allocated at the new raid level to
8824                  * carry on
8825                  */
8826                 if (ret == -ENOSPC)
8827                         ret = 0;
8828                 if (ret < 0)
8829                         goto out;
8830         }
8831
8832         ret = set_block_group_ro(cache, 0);
8833         if (!ret)
8834                 goto out;
8835         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8836         ret = do_chunk_alloc(trans, root, alloc_flags,
8837                              CHUNK_ALLOC_FORCE);
8838         if (ret < 0)
8839                 goto out;
8840         ret = set_block_group_ro(cache, 0);
8841 out:
8842         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
8843                 alloc_flags = update_block_group_flags(root, cache->flags);
8844                 lock_chunks(root->fs_info->chunk_root);
8845                 check_system_chunk(trans, root, alloc_flags);
8846                 unlock_chunks(root->fs_info->chunk_root);
8847         }
8848         mutex_unlock(&root->fs_info->ro_block_group_mutex);
8849
8850         btrfs_end_transaction(trans, root);
8851         return ret;
8852 }
8853
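/*
 * Force allocation of a new chunk for the given block group type,
 * regardless of how full the existing chunks are.
 */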
8854 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8855                             struct btrfs_root *root, u64 type)
8856 {
8857         u64 alloc_flags = get_alloc_profile(root, type);
8858         return do_chunk_alloc(trans, root, alloc_flags,
8859                               CHUNK_ALLOC_FORCE);
8860 }
8861
8862 /*
8863  * helper to account the unused space of all the readonly block groups in the
8864  * space_info. Takes mirrors into account.
8865  */
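/*
 * Example with made-up numbers: a read-only RAID1 block group whose
 * key.offset is 1024MiB with 400MiB used contributes
 * (1024 - 400) * 2 = 1248MiB here, because the unused space exists on
 * both mirrors.
 */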
8866 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8867 {
8868         struct btrfs_block_group_cache *block_group;
8869         u64 free_bytes = 0;
8870         int factor;
8871
8872         /* It's df, we don't care if it's racy */
8873         if (list_empty(&sinfo->ro_bgs))
8874                 return 0;
8875
8876         spin_lock(&sinfo->lock);
8877         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
8878                 spin_lock(&block_group->lock);
8879
8880                 if (!block_group->ro) {
8881                         spin_unlock(&block_group->lock);
8882                         continue;
8883                 }
8884
8885                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8886                                           BTRFS_BLOCK_GROUP_RAID10 |
8887                                           BTRFS_BLOCK_GROUP_DUP))
8888                         factor = 2;
8889                 else
8890                         factor = 1;
8891
8892                 free_bytes += (block_group->key.offset -
8893                                btrfs_block_group_used(&block_group->item)) *
8894                                factor;
8895
8896                 spin_unlock(&block_group->lock);
8897         }
8898         spin_unlock(&sinfo->lock);
8899
8900         return free_bytes;
8901 }
8902
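/*
 * Undo a previous read-only marking: move the group's unused bytes back
 * out of the space_info's readonly accounting, clear cache->ro and take
 * the group off the ro_bgs list.
 */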
8903 void btrfs_set_block_group_rw(struct btrfs_root *root,
8904                               struct btrfs_block_group_cache *cache)
8905 {
8906         struct btrfs_space_info *sinfo = cache->space_info;
8907         u64 num_bytes;
8908
8909         BUG_ON(!cache->ro);
8910
8911         spin_lock(&sinfo->lock);
8912         spin_lock(&cache->lock);
8913         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8914                     cache->bytes_super - btrfs_block_group_used(&cache->item);
8915         sinfo->bytes_readonly -= num_bytes;
8916         cache->ro = 0;
8917         list_del_init(&cache->ro_list);
8918         spin_unlock(&cache->lock);
8919         spin_unlock(&sinfo->lock);
8920 }
8921
8922 /*
8923  * checks to see if its even possible to relocate this block group.
8924  * checks to see if it's even possible to relocate this block group.
8925  *
8926  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
8927  */
8928 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8929 {
8930         struct btrfs_block_group_cache *block_group;
8931         struct btrfs_space_info *space_info;
8932         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8933         struct btrfs_device *device;
8934         struct btrfs_trans_handle *trans;
8935         u64 min_free;
8936         u64 dev_min = 1;
8937         u64 dev_nr = 0;
8938         u64 target;
8939         int index;
8940         int full = 0;
8941         int ret = 0;
8942
8943         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
8944
8945         /* odd, couldn't find the block group, leave it alone */
8946         if (!block_group)
8947                 return -1;
8948
8949         min_free = btrfs_block_group_used(&block_group->item);
8950
8951         /* no bytes used, we're good */
8952         if (!min_free)
8953                 goto out;
8954
8955         space_info = block_group->space_info;
8956         spin_lock(&space_info->lock);
8957
8958         full = space_info->full;
8959
8960         /*
8961          * if this is the last block group we have in this space, we can't
8962          * relocate it unless we're able to allocate a new chunk below.
8963          *
8964          * Otherwise, we need to make sure we have room in the space to handle
8965          * all of the extents from this block group.  If we can, we're good
8966          */
8967         if ((space_info->total_bytes != block_group->key.offset) &&
8968             (space_info->bytes_used + space_info->bytes_reserved +
8969              space_info->bytes_pinned + space_info->bytes_readonly +
8970              min_free < space_info->total_bytes)) {
8971                 spin_unlock(&space_info->lock);
8972                 goto out;
8973         }
8974         spin_unlock(&space_info->lock);
8975
8976         /*
8977          * ok we don't have enough space, but maybe we have free space on our
8978          * devices to allocate new chunks for relocation, so loop through our
8979          * alloc devices and guess if we have enough space.  if this block
8980          * group is going to be restriped, run checks against the target
8981          * profile instead of the current one.
8982          */
8983         ret = -1;
8984
8985         /*
8986          * index:
8987          *      0: raid10
8988          *      1: raid1
8989          *      2: dup
8990          *      3: raid0
8991          *      4: single
8992          */
8993         target = get_restripe_target(root->fs_info, block_group->flags);
8994         if (target) {
8995                 index = __get_raid_index(extended_to_chunk(target));
8996         } else {
8997                 /*
8998                  * this is just a balance, so if we were marked as full
8999                  * we know there is no space for a new chunk
9000                  */
9001                 if (full)
9002                         goto out;
9003
9004                 index = get_block_group_index(block_group);
9005         }
9006
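        /*
         * Scale the space requirement for the profile we are checking:
         * RAID10 needs at least four devices but only half of min_free per
         * device, RAID1 needs two devices each holding the full amount,
         * DUP keeps two copies on one device so min_free doubles, and
         * RAID0 spreads min_free evenly across all writable devices.
         */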
9007         if (index == BTRFS_RAID_RAID10) {
9008                 dev_min = 4;
9009                 /* Divide by 2 */
9010                 min_free >>= 1;
9011         } else if (index == BTRFS_RAID_RAID1) {
9012                 dev_min = 2;
9013         } else if (index == BTRFS_RAID_DUP) {
9014                 /* Multiply by 2 */
9015                 min_free <<= 1;
9016         } else if (index == BTRFS_RAID_RAID0) {
9017                 dev_min = fs_devices->rw_devices;
9018                 min_free = div64_u64(min_free, dev_min);
9019         }
9020
9021         /* We need to do this so that we can look at pending chunks */
9022         trans = btrfs_join_transaction(root);
9023         if (IS_ERR(trans)) {
9024                 ret = PTR_ERR(trans);
9025                 goto out;
9026         }
9027
9028         mutex_lock(&root->fs_info->chunk_mutex);
9029         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9030                 u64 dev_offset;
9031
9032                 /*
9033                  * check to make sure we can actually find a chunk with enough
9034                  * space to fit our block group in.
9035                  */
9036                 if (device->total_bytes > device->bytes_used + min_free &&
9037                     !device->is_tgtdev_for_dev_replace) {
9038                         ret = find_free_dev_extent(trans, device, min_free,
9039                                                    &dev_offset, NULL);
9040                         if (!ret)
9041                                 dev_nr++;
9042
9043                         if (dev_nr >= dev_min)
9044                                 break;
9045
9046                         ret = -1;
9047                 }
9048         }
9049         mutex_unlock(&root->fs_info->chunk_mutex);
9050         btrfs_end_transaction(trans, root);
9051 out:
9052         btrfs_put_block_group(block_group);
9053         return ret;
9054 }
9055
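/*
 * Find the first BLOCK_GROUP_ITEM in the extent tree at or after @key.
 * Returns 0 with @path positioned on the item, a positive value if there
 * are no more block group items, or a negative errno on error.
 */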
9056 static int find_first_block_group(struct btrfs_root *root,
9057                 struct btrfs_path *path, struct btrfs_key *key)
9058 {
9059         int ret = 0;
9060         struct btrfs_key found_key;
9061         struct extent_buffer *leaf;
9062         int slot;
9063
9064         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9065         if (ret < 0)
9066                 goto out;
9067
9068         while (1) {
9069                 slot = path->slots[0];
9070                 leaf = path->nodes[0];
9071                 if (slot >= btrfs_header_nritems(leaf)) {
9072                         ret = btrfs_next_leaf(root, path);
9073                         if (ret == 0)
9074                                 continue;
9075                         if (ret < 0)
9076                                 goto out;
9077                         break;
9078                 }
9079                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9080
9081                 if (found_key.objectid >= key->objectid &&
9082                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9083                         ret = 0;
9084                         goto out;
9085                 }
9086                 path->slots[0]++;
9087         }
9088 out:
9089         return ret;
9090 }
9091
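/*
 * Drop the free space cache inode reference (iref) held by each block
 * group so those inodes can finally be released.
 */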
9092 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9093 {
9094         struct btrfs_block_group_cache *block_group;
9095         u64 last = 0;
9096
9097         while (1) {
9098                 struct inode *inode;
9099
9100                 block_group = btrfs_lookup_first_block_group(info, last);
9101                 while (block_group) {
9102                         spin_lock(&block_group->lock);
9103                         if (block_group->iref)
9104                                 break;
9105                         spin_unlock(&block_group->lock);
9106                         block_group = next_block_group(info->tree_root,
9107                                                        block_group);
9108                 }
9109                 if (!block_group) {
9110                         if (last == 0)
9111                                 break;
9112                         last = 0;
9113                         continue;
9114                 }
9115
9116                 inode = block_group->inode;
9117                 block_group->iref = 0;
9118                 block_group->inode = NULL;
9119                 spin_unlock(&block_group->lock);
9120                 iput(inode);
9121                 last = block_group->key.objectid + block_group->key.offset;
9122                 btrfs_put_block_group(block_group);
9123         }
9124 }
9125
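/*
 * Tear down all in-memory block group state: pending caching controls,
 * the unused_bgs list, every block group in the cache tree (waiting for
 * in-flight caching and dropping any excluded extents), and finally the
 * space_info structures themselves.
 */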
9126 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9127 {
9128         struct btrfs_block_group_cache *block_group;
9129         struct btrfs_space_info *space_info;
9130         struct btrfs_caching_control *caching_ctl;
9131         struct rb_node *n;
9132
9133         down_write(&info->commit_root_sem);
9134         while (!list_empty(&info->caching_block_groups)) {
9135                 caching_ctl = list_entry(info->caching_block_groups.next,
9136                                          struct btrfs_caching_control, list);
9137                 list_del(&caching_ctl->list);
9138                 put_caching_control(caching_ctl);
9139         }
9140         up_write(&info->commit_root_sem);
9141
9142         spin_lock(&info->unused_bgs_lock);
9143         while (!list_empty(&info->unused_bgs)) {
9144                 block_group = list_first_entry(&info->unused_bgs,
9145                                                struct btrfs_block_group_cache,
9146                                                bg_list);
9147                 list_del_init(&block_group->bg_list);
9148                 btrfs_put_block_group(block_group);
9149         }
9150         spin_unlock(&info->unused_bgs_lock);
9151
9152         spin_lock(&info->block_group_cache_lock);
9153         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9154                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9155                                        cache_node);
9156                 rb_erase(&block_group->cache_node,
9157                          &info->block_group_cache_tree);
9158                 RB_CLEAR_NODE(&block_group->cache_node);
9159                 spin_unlock(&info->block_group_cache_lock);
9160
9161                 down_write(&block_group->space_info->groups_sem);
9162                 list_del(&block_group->list);
9163                 up_write(&block_group->space_info->groups_sem);
9164
9165                 if (block_group->cached == BTRFS_CACHE_STARTED)
9166                         wait_block_group_cache_done(block_group);
9167
9168                 /*
9169                  * We haven't cached this block group, which means we could
9170                  * possibly have excluded extents on this block group.
9171                  */
9172                 if (block_group->cached == BTRFS_CACHE_NO ||
9173                     block_group->cached == BTRFS_CACHE_ERROR)
9174                         free_excluded_extents(info->extent_root, block_group);
9175
9176                 btrfs_remove_free_space_cache(block_group);
9177                 btrfs_put_block_group(block_group);
9178
9179                 spin_lock(&info->block_group_cache_lock);
9180         }
9181         spin_unlock(&info->block_group_cache_lock);
9182
9183         /* now that all the block groups are freed, go through and
9184          * free all the space_info structs.  This is only called during
9185          * the final stages of unmount, and so we know nobody is
9186          * using them.  We call synchronize_rcu() once before we start,
9187          * just to be on the safe side.
9188          */
9189         synchronize_rcu();
9190
9191         release_global_block_rsv(info);
9192
9193         while (!list_empty(&info->space_info)) {
9194                 int i;
9195
9196                 space_info = list_entry(info->space_info.next,
9197                                         struct btrfs_space_info,
9198                                         list);
9199                 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
9200                         if (WARN_ON(space_info->bytes_pinned > 0 ||
9201                             space_info->bytes_reserved > 0 ||
9202                             space_info->bytes_may_use > 0)) {
9203                                 dump_space_info(space_info, 0, 0);
9204                         }
9205                 }
9206                 list_del(&space_info->list);
9207                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9208                         struct kobject *kobj;
9209                         kobj = space_info->block_group_kobjs[i];
9210                         space_info->block_group_kobjs[i] = NULL;
9211                         if (kobj) {
9212                                 kobject_del(kobj);
9213                                 kobject_put(kobj);
9214                         }
9215                 }
9216                 kobject_del(&space_info->kobj);
9217                 kobject_put(&space_info->kobj);
9218         }
9219         return 0;
9220 }
9221
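/*
 * Add a block group to its space_info's per-RAID-index list.  The first
 * group of a given RAID type also gets a sysfs kobject created for it.
 */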
9222 static void __link_block_group(struct btrfs_space_info *space_info,
9223                                struct btrfs_block_group_cache *cache)
9224 {
9225         int index = get_block_group_index(cache);
9226         bool first = false;
9227
9228         down_write(&space_info->groups_sem);
9229         if (list_empty(&space_info->block_groups[index]))
9230                 first = true;
9231         list_add_tail(&cache->list, &space_info->block_groups[index]);
9232         up_write(&space_info->groups_sem);
9233
9234         if (first) {
9235                 struct raid_kobject *rkobj;
9236                 int ret;
9237
9238                 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9239                 if (!rkobj)
9240                         goto out_err;
9241                 rkobj->raid_type = index;
9242                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9243                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9244                                   "%s", get_raid_name(index));
9245                 if (ret) {
9246                         kobject_put(&rkobj->kobj);
9247                         goto out_err;
9248                 }
9249                 space_info->block_group_kobjs[index] = &rkobj->kobj;
9250         }
9251
9252         return;
9253 out_err:
9254         pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
9255 }
9256
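/*
 * Allocate and initialize an in-memory block group cache structure
 * covering [start, start + size).  Returns NULL on allocation failure.
 */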
9257 static struct btrfs_block_group_cache *
9258 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9259 {
9260         struct btrfs_block_group_cache *cache;
9261
9262         cache = kzalloc(sizeof(*cache), GFP_NOFS);
9263         if (!cache)
9264                 return NULL;
9265
9266         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9267                                         GFP_NOFS);
9268         if (!cache->free_space_ctl) {
9269                 kfree(cache);
9270                 return NULL;
9271         }
9272
9273         cache->key.objectid = start;
9274         cache->key.offset = size;
9275         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9276
9277         cache->sectorsize = root->sectorsize;
9278         cache->fs_info = root->fs_info;
9279         cache->full_stripe_len = btrfs_full_stripe_len(root,
9280                                                &root->fs_info->mapping_tree,
9281                                                start);
9282         atomic_set(&cache->count, 1);
9283         spin_lock_init(&cache->lock);
9284         init_rwsem(&cache->data_rwsem);
9285         INIT_LIST_HEAD(&cache->list);
9286         INIT_LIST_HEAD(&cache->cluster_list);
9287         INIT_LIST_HEAD(&cache->bg_list);
9288         INIT_LIST_HEAD(&cache->ro_list);
9289         INIT_LIST_HEAD(&cache->dirty_list);
9290         INIT_LIST_HEAD(&cache->io_list);
9291         btrfs_init_free_space_ctl(cache);
9292         atomic_set(&cache->trimming, 0);
9293
9294         return cache;
9295 }
9296
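/*
 * Called while mounting: walk the extent tree, create an in-memory block
 * group cache for every BLOCK_GROUP_ITEM found and set up its space
 * cache state (marking stale space caches to be cleared when needed).
 */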
9297 int btrfs_read_block_groups(struct btrfs_root *root)
9298 {
9299         struct btrfs_path *path;
9300         int ret;
9301         struct btrfs_block_group_cache *cache;
9302         struct btrfs_fs_info *info = root->fs_info;
9303         struct btrfs_space_info *space_info;
9304         struct btrfs_key key;
9305         struct btrfs_key found_key;
9306         struct extent_buffer *leaf;
9307         int need_clear = 0;
9308         u64 cache_gen;
9309
9310         root = info->extent_root;
9311         key.objectid = 0;
9312         key.offset = 0;
9313         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9314         path = btrfs_alloc_path();
9315         if (!path)
9316                 return -ENOMEM;
9317         path->reada = 1;
9318
9319         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
9320         if (btrfs_test_opt(root, SPACE_CACHE) &&
9321             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
9322                 need_clear = 1;
9323         if (btrfs_test_opt(root, CLEAR_CACHE))
9324                 need_clear = 1;
9325
9326         while (1) {
9327                 ret = find_first_block_group(root, path, &key);
9328                 if (ret > 0)
9329                         break;
9330                 if (ret != 0)
9331                         goto error;
9332
9333                 leaf = path->nodes[0];
9334                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9335
9336                 cache = btrfs_create_block_group_cache(root, found_key.objectid,
9337                                                        found_key.offset);
9338                 if (!cache) {
9339                         ret = -ENOMEM;
9340                         goto error;
9341                 }
9342
9343                 if (need_clear) {
9344                         /*
9345                          * When we mount with an old space cache, we need
9346                          * to set BTRFS_DC_CLEAR and set the dirty flag.
9347                          *
9348                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9349                          *    truncate the old free space cache inode and
9350                          *    set up a new one.
9351                          * b) Setting the 'dirty' flag makes sure that we
9352                          *    flush the new space cache info onto disk.
9353                          */
9354                         if (btrfs_test_opt(root, SPACE_CACHE))
9355                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
9356                 }
9357
9358                 read_extent_buffer(leaf, &cache->item,
9359                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
9360                                    sizeof(cache->item));
9361                 cache->flags = btrfs_block_group_flags(&cache->item);
9362
9363                 key.objectid = found_key.objectid + found_key.offset;
9364                 btrfs_release_path(path);
9365
9366                 /*
9367                  * We need to exclude the super stripes now so that the space
9368                  * info has super bytes accounted for, otherwise we'll think
9369                  * we have more space than we actually do.
9370                  */
9371                 ret = exclude_super_stripes(root, cache);
9372                 if (ret) {
9373                         /*
9374                          * We may have excluded something, so call this just in
9375                          * case.
9376                          */
9377                         free_excluded_extents(root, cache);
9378                         btrfs_put_block_group(cache);
9379                         goto error;
9380                 }
9381
9382                 /*
9383                  * Check for two cases: either we are full, and therefore
9384                  * don't need to bother with the caching work since we won't
9385                  * find any space, or we are empty, and we can just add all
9386                  * the space in and be done with it.  This saves us a lot of
9387                  * time, particularly in the full case.
9388                  */
9389                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
9390                         cache->last_byte_to_unpin = (u64)-1;
9391                         cache->cached = BTRFS_CACHE_FINISHED;
9392                         free_excluded_extents(root, cache);
9393                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9394                         cache->last_byte_to_unpin = (u64)-1;
9395                         cache->cached = BTRFS_CACHE_FINISHED;
9396                         add_new_free_space(cache, root->fs_info,
9397                                            found_key.objectid,
9398                                            found_key.objectid +
9399                                            found_key.offset);
9400                         free_excluded_extents(root, cache);
9401                 }
9402
9403                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
9404                 if (ret) {
9405                         btrfs_remove_free_space_cache(cache);
9406                         btrfs_put_block_group(cache);
9407                         goto error;
9408                 }
9409
9410                 ret = update_space_info(info, cache->flags, found_key.offset,
9411                                         btrfs_block_group_used(&cache->item),
9412                                         &space_info);
9413                 if (ret) {
9414                         btrfs_remove_free_space_cache(cache);
9415                         spin_lock(&info->block_group_cache_lock);
9416                         rb_erase(&cache->cache_node,
9417                                  &info->block_group_cache_tree);
9418                         RB_CLEAR_NODE(&cache->cache_node);
9419                         spin_unlock(&info->block_group_cache_lock);
9420                         btrfs_put_block_group(cache);
9421                         goto error;
9422                 }
9423
9424                 cache->space_info = space_info;
9425                 spin_lock(&cache->space_info->lock);
9426                 cache->space_info->bytes_readonly += cache->bytes_super;
9427                 spin_unlock(&cache->space_info->lock);
9428
9429                 __link_block_group(space_info, cache);
9430
9431                 set_avail_alloc_bits(root->fs_info, cache->flags);
9432                 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
9433                         set_block_group_ro(cache, 1);
9434                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9435                         spin_lock(&info->unused_bgs_lock);
9436                         /* Should always be true but just in case. */
9437                         if (list_empty(&cache->bg_list)) {
9438                                 btrfs_get_block_group(cache);
9439                                 list_add_tail(&cache->bg_list,
9440                                               &info->unused_bgs);
9441                         }
9442                         spin_unlock(&info->unused_bgs_lock);
9443                 }
9444         }
9445
9446         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
9447                 if (!(get_alloc_profile(root, space_info->flags) &
9448                       (BTRFS_BLOCK_GROUP_RAID10 |
9449                        BTRFS_BLOCK_GROUP_RAID1 |
9450                        BTRFS_BLOCK_GROUP_RAID5 |
9451                        BTRFS_BLOCK_GROUP_RAID6 |
9452                        BTRFS_BLOCK_GROUP_DUP)))
9453                         continue;
9454                 /*
9455                  * Avoid allocating from un-mirrored block groups if there
9456                  * are mirrored block groups.
9457                  */
9458                 list_for_each_entry(cache,
9459                                 &space_info->block_groups[BTRFS_RAID_RAID0],
9460                                 list)
9461                         set_block_group_ro(cache, 1);
9462                 list_for_each_entry(cache,
9463                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
9464                                 list)
9465                         set_block_group_ro(cache, 1);
9466         }
9467
9468         init_global_block_rsv(info);
9469         ret = 0;
9470 error:
9471         btrfs_free_path(path);
9472         return ret;
9473 }
9474
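/*
 * Insert the block group items for all block groups created during this
 * transaction (queued on trans->new_bgs) into the extent tree and finish
 * their chunk allocation; any failure aborts the transaction.
 */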
9475 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9476                                        struct btrfs_root *root)
9477 {
9478         struct btrfs_block_group_cache *block_group, *tmp;
9479         struct btrfs_root *extent_root = root->fs_info->extent_root;
9480         struct btrfs_block_group_item item;
9481         struct btrfs_key key;
9482         int ret = 0;
9483
9484         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9485                 if (ret)
9486                         goto next;
9487
9488                 spin_lock(&block_group->lock);
9489                 memcpy(&item, &block_group->item, sizeof(item));
9490                 memcpy(&key, &block_group->key, sizeof(key));
9491                 spin_unlock(&block_group->lock);
9492
9493                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
9494                                         sizeof(item));
9495                 if (ret)
9496                         btrfs_abort_transaction(trans, extent_root, ret);
9497                 ret = btrfs_finish_chunk_alloc(trans, extent_root,
9498                                                key.objectid, key.offset);
9499                 if (ret)
9500                         btrfs_abort_transaction(trans, extent_root, ret);
9501 next:
9502                 list_del_init(&block_group->bg_list);
9503         }
9504 }
9505
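/*
 * Create a new block group for a freshly allocated chunk: build the cache
 * structure, exclude the super stripes, seed the free space, hook the group
 * up to its space_info and queue it on trans->new_bgs so that
 * btrfs_create_pending_block_groups() inserts its item later on.
 */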
9506 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9507                            struct btrfs_root *root, u64 bytes_used,
9508                            u64 type, u64 chunk_objectid, u64 chunk_offset,
9509                            u64 size)
9510 {
9511         int ret;
9512         struct btrfs_root *extent_root;
9513         struct btrfs_block_group_cache *cache;
9514
9515         extent_root = root->fs_info->extent_root;
9516
9517         btrfs_set_log_full_commit(root->fs_info, trans);
9518
9519         cache = btrfs_create_block_group_cache(root, chunk_offset, size);
9520         if (!cache)
9521                 return -ENOMEM;
9522
9523         btrfs_set_block_group_used(&cache->item, bytes_used);
9524         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
9525         btrfs_set_block_group_flags(&cache->item, type);
9526
9527         cache->flags = type;
9528         cache->last_byte_to_unpin = (u64)-1;
9529         cache->cached = BTRFS_CACHE_FINISHED;
9530         ret = exclude_super_stripes(root, cache);
9531         if (ret) {
9532                 /*
9533                  * We may have excluded something, so call this just in
9534                  * case.
9535                  */
9536                 free_excluded_extents(root, cache);
9537                 btrfs_put_block_group(cache);
9538                 return ret;
9539         }
9540
9541         add_new_free_space(cache, root->fs_info, chunk_offset,
9542                            chunk_offset + size);
9543
9544         free_excluded_extents(root, cache);
9545
9546         /*
9547          * Call to ensure the corresponding space_info object is created and
9548          * assigned to our block group, but don't update its counters just yet.
9549          * We want our bg to be added to the rbtree with its ->space_info set.
9550          */
9551         ret = update_space_info(root->fs_info, cache->flags, 0, 0,
9552                                 &cache->space_info);
9553         if (ret) {
9554                 btrfs_remove_free_space_cache(cache);
9555                 btrfs_put_block_group(cache);
9556                 return ret;
9557         }
9558
9559         ret = btrfs_add_block_group_cache(root->fs_info, cache);
9560         if (ret) {
9561                 btrfs_remove_free_space_cache(cache);
9562                 btrfs_put_block_group(cache);
9563                 return ret;
9564         }
9565
9566         /*
9567          * Now that our block group has its ->space_info set and is inserted in
9568          * the rbtree, update the space info's counters.
9569          */
9570         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
9571                                 &cache->space_info);
9572         if (ret) {
9573                 btrfs_remove_free_space_cache(cache);
9574                 spin_lock(&root->fs_info->block_group_cache_lock);
9575                 rb_erase(&cache->cache_node,
9576                          &root->fs_info->block_group_cache_tree);
9577                 RB_CLEAR_NODE(&cache->cache_node);
9578                 spin_unlock(&root->fs_info->block_group_cache_lock);
9579                 btrfs_put_block_group(cache);
9580                 return ret;
9581         }
9582         update_global_block_rsv(root->fs_info);
9583
9584         spin_lock(&cache->space_info->lock);
9585         cache->space_info->bytes_readonly += cache->bytes_super;
9586         spin_unlock(&cache->space_info->lock);
9587
9588         __link_block_group(cache->space_info, cache);
9589
9590         list_add_tail(&cache->bg_list, &trans->new_bgs);
9591
9592         set_avail_alloc_bits(extent_root->fs_info, type);
9593
9594         return 0;
9595 }
9596
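/*
 * Counterpart of set_avail_alloc_bits(): drop the extended profile bits of
 * @flags from the per-type avail_*_alloc_bits masks, used when the last
 * block group of a given profile is removed.
 */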
9597 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9598 {
9599         u64 extra_flags = chunk_to_extended(flags) &
9600                                 BTRFS_EXTENDED_PROFILE_MASK;
9601
9602         write_seqlock(&fs_info->profiles_lock);
9603         if (flags & BTRFS_BLOCK_GROUP_DATA)
9604                 fs_info->avail_data_alloc_bits &= ~extra_flags;
9605         if (flags & BTRFS_BLOCK_GROUP_METADATA)
9606                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
9607         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
9608                 fs_info->avail_system_alloc_bits &= ~extra_flags;
9609         write_sequnlock(&fs_info->profiles_lock);
9610 }
9611
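/*
 * Remove an empty, read-only block group: drop its free space cache inode,
 * delete the free space and block group items from their trees, unlink the
 * group from its space_info and, unless a trim is still running on it, drop
 * its chunk mapping from fs_info->mapping_tree as well.
 */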
9612 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9613                              struct btrfs_root *root, u64 group_start,
9614                              struct extent_map *em)
9615 {
9616         struct btrfs_path *path;
9617         struct btrfs_block_group_cache *block_group;
9618         struct btrfs_free_cluster *cluster;
9619         struct btrfs_root *tree_root = root->fs_info->tree_root;
9620         struct btrfs_key key;
9621         struct inode *inode;
9622         struct kobject *kobj = NULL;
9623         int ret;
9624         int index;
9625         int factor;
9626         struct btrfs_caching_control *caching_ctl = NULL;
9627         bool remove_em;
9628
9629         root = root->fs_info->extent_root;
9630
9631         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
9632         BUG_ON(!block_group);
9633         BUG_ON(!block_group->ro);
9634
9635         /*
9636          * Free the reserved super bytes from this block group before
9637          * removing it.
9638          */
9639         free_excluded_extents(root, block_group);
9640
9641         memcpy(&key, &block_group->key, sizeof(key));
9642         index = get_block_group_index(block_group);
9643         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
9644                                   BTRFS_BLOCK_GROUP_RAID1 |
9645                                   BTRFS_BLOCK_GROUP_RAID10))
9646                 factor = 2;
9647         else
9648                 factor = 1;
9649
9650         /* make sure this block group isn't part of an allocation cluster */
9651         cluster = &root->fs_info->data_alloc_cluster;
9652         spin_lock(&cluster->refill_lock);
9653         btrfs_return_cluster_to_free_space(block_group, cluster);
9654         spin_unlock(&cluster->refill_lock);
9655
9656         /*
9657          * make sure this block group isn't part of a metadata
9658          * allocation cluster
9659          */
9660         cluster = &root->fs_info->meta_alloc_cluster;
9661         spin_lock(&cluster->refill_lock);
9662         btrfs_return_cluster_to_free_space(block_group, cluster);
9663         spin_unlock(&cluster->refill_lock);
9664
9665         path = btrfs_alloc_path();
9666         if (!path) {
9667                 ret = -ENOMEM;
9668                 goto out;
9669         }
9670
9671         /*
9672          * get the inode first so any iput calls done for the io_list
9673          * aren't the final iput (no unlinks allowed now)
9674          */
9675         inode = lookup_free_space_inode(tree_root, block_group, path);
9676
9677         mutex_lock(&trans->transaction->cache_write_mutex);
9678         /*
9679          * make sure our free space cache IO is done before removing the
9680          * free space inode
9681          */
9682         spin_lock(&trans->transaction->dirty_bgs_lock);
9683         if (!list_empty(&block_group->io_list)) {
9684                 list_del_init(&block_group->io_list);
9685
9686                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
9687
9688                 spin_unlock(&trans->transaction->dirty_bgs_lock);
9689                 btrfs_wait_cache_io(root, trans, block_group,
9690                                     &block_group->io_ctl, path,
9691                                     block_group->key.objectid);
9692                 btrfs_put_block_group(block_group);
9693                 spin_lock(&trans->transaction->dirty_bgs_lock);
9694         }
9695
9696         if (!list_empty(&block_group->dirty_list)) {
9697                 list_del_init(&block_group->dirty_list);
9698                 btrfs_put_block_group(block_group);
9699         }
9700         spin_unlock(&trans->transaction->dirty_bgs_lock);
9701         mutex_unlock(&trans->transaction->cache_write_mutex);
9702
9703         if (!IS_ERR(inode)) {
9704                 ret = btrfs_orphan_add(trans, inode);
9705                 if (ret) {
9706                         btrfs_add_delayed_iput(inode);
9707                         goto out;
9708                 }
9709                 clear_nlink(inode);
9710                 /* One for the block group's ref */
9711                 spin_lock(&block_group->lock);
9712                 if (block_group->iref) {
9713                         block_group->iref = 0;
9714                         block_group->inode = NULL;
9715                         spin_unlock(&block_group->lock);
9716                         iput(inode);
9717                 } else {
9718                         spin_unlock(&block_group->lock);
9719                 }
9720                 /* One for our lookup ref */
9721                 btrfs_add_delayed_iput(inode);
9722         }
9723
9724         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9725         key.offset = block_group->key.objectid;
9726         key.type = 0;
9727
9728         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9729         if (ret < 0)
9730                 goto out;
9731         if (ret > 0)
9732                 btrfs_release_path(path);
9733         if (ret == 0) {
9734                 ret = btrfs_del_item(trans, tree_root, path);
9735                 if (ret)
9736                         goto out;
9737                 btrfs_release_path(path);
9738         }
9739
9740         spin_lock(&root->fs_info->block_group_cache_lock);
9741         rb_erase(&block_group->cache_node,
9742                  &root->fs_info->block_group_cache_tree);
9743         RB_CLEAR_NODE(&block_group->cache_node);
9744
9745         if (root->fs_info->first_logical_byte == block_group->key.objectid)
9746                 root->fs_info->first_logical_byte = (u64)-1;
9747         spin_unlock(&root->fs_info->block_group_cache_lock);
9748
9749         down_write(&block_group->space_info->groups_sem);
9750         /*
9751          * we must use list_del_init so people can check to see if they
9752          * are still on the list after taking the semaphore
9753          */
9754         list_del_init(&block_group->list);
9755         if (list_empty(&block_group->space_info->block_groups[index])) {
9756                 kobj = block_group->space_info->block_group_kobjs[index];
9757                 block_group->space_info->block_group_kobjs[index] = NULL;
9758                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
9759         }
9760         up_write(&block_group->space_info->groups_sem);
9761         if (kobj) {
9762                 kobject_del(kobj);
9763                 kobject_put(kobj);
9764         }
9765
9766         if (block_group->has_caching_ctl)
9767                 caching_ctl = get_caching_control(block_group);
9768         if (block_group->cached == BTRFS_CACHE_STARTED)
9769                 wait_block_group_cache_done(block_group);
9770         if (block_group->has_caching_ctl) {
9771                 down_write(&root->fs_info->commit_root_sem);
9772                 if (!caching_ctl) {
9773                         struct btrfs_caching_control *ctl;
9774
9775                         list_for_each_entry(ctl,
9776                                     &root->fs_info->caching_block_groups, list)
9777                                 if (ctl->block_group == block_group) {
9778                                         caching_ctl = ctl;
9779                                         atomic_inc(&caching_ctl->count);
9780                                         break;
9781                                 }
9782                 }
9783                 if (caching_ctl)
9784                         list_del_init(&caching_ctl->list);
9785                 up_write(&root->fs_info->commit_root_sem);
9786                 if (caching_ctl) {
9787                         /* Once for the caching bgs list and once for us. */
9788                         put_caching_control(caching_ctl);
9789                         put_caching_control(caching_ctl);
9790                 }
9791         }
9792
9793         spin_lock(&trans->transaction->dirty_bgs_lock);
9794         WARN_ON(!list_empty(&block_group->dirty_list));
9795         WARN_ON(!list_empty(&block_group->io_list));
9800         spin_unlock(&trans->transaction->dirty_bgs_lock);
9801         btrfs_remove_free_space_cache(block_group);
9802
9803         spin_lock(&block_group->space_info->lock);
9804         list_del_init(&block_group->ro_list);
9805
9806         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
9807                 WARN_ON(block_group->space_info->total_bytes
9808                         < block_group->key.offset);
9809                 WARN_ON(block_group->space_info->bytes_readonly
9810                         < block_group->key.offset);
9811                 WARN_ON(block_group->space_info->disk_total
9812                         < block_group->key.offset * factor);
9813         }
9814         block_group->space_info->total_bytes -= block_group->key.offset;
9815         block_group->space_info->bytes_readonly -= block_group->key.offset;
9816         block_group->space_info->disk_total -= block_group->key.offset * factor;
9817
9818         spin_unlock(&block_group->space_info->lock);
9819
9820         memcpy(&key, &block_group->key, sizeof(key));
9821
9822         lock_chunks(root);
9823         if (!list_empty(&em->list)) {
9824                 /* We're in the transaction->pending_chunks list. */
9825                 free_extent_map(em);
9826         }
9827         spin_lock(&block_group->lock);
9828         block_group->removed = 1;
9829         /*
9830          * At this point trimming can't start on this block group, because we
9831          * removed the block group from fs_info->block_group_cache_tree, so
9832          * no one can find it anymore, and even if someone already got this
9833          * block group before we removed it from the rbtree, they have already
9834          * incremented block_group->trimming - if they didn't, they won't find
9835          * any free space entries because we already removed them all when we
9836          * called btrfs_remove_free_space_cache().
9837          *
9838          * And we must not remove the extent map from the fs_info->mapping_tree,
9839          * so that the same logical address range and physical device space
9840          * ranges are not reused for a new block group. This is because our
9841          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9842          * completely transactionless, so while it is trimming a range the
9843          * currently running transaction might finish and a new one start,
9844          * allowing for new block groups to be created that can reuse the same
9845          * physical device locations unless we take this special care.
9846          */
9847         remove_em = (atomic_read(&block_group->trimming) == 0);
9848         /*
9849          * Make sure a trimmer task always sees the em in the pinned_chunks list
9850          * if it sees block_group->removed == 1 (needs to lock block_group->lock
9851          * before checking block_group->removed).
9852          */
9853         if (!remove_em) {
9854                 /*
9855                  * Our em might be in trans->transaction->pending_chunks which
9856                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
9857                  * and so is the fs_info->pinned_chunks list.
9858                  *
9859                  * So at this point we must be holding the chunk_mutex to avoid
9860                  * any races with chunk allocation (more specifically at
9861                  * volumes.c:contains_pending_extent()), to ensure it always
9862                  * sees the em, either in the pending_chunks list or in the
9863                  * pinned_chunks list.
9864                  */
9865                 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
9866         }
9867         spin_unlock(&block_group->lock);
9868
9869         if (remove_em) {
9870                 struct extent_map_tree *em_tree;
9871
9872                 em_tree = &root->fs_info->mapping_tree.map_tree;
9873                 write_lock(&em_tree->lock);
9874                 /*
9875                  * The em might be in the pending_chunks list, so make sure the
9876                  * chunk mutex is locked, since remove_extent_mapping() will
9877                  * delete us from that list.
9878                  */
9879                 remove_extent_mapping(em_tree, em);
9880                 write_unlock(&em_tree->lock);
9881                 /* once for the tree */
9882                 free_extent_map(em);
9883         }
9884
9885         unlock_chunks(root);
9886
9887         btrfs_put_block_group(block_group);
9888         btrfs_put_block_group(block_group);
9889
9890         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9891         if (ret > 0)
9892                 ret = -EIO;
9893         if (ret < 0)
9894                 goto out;
9895
9896         ret = btrfs_del_item(trans, root, path);
9897 out:
9898         btrfs_free_path(path);
9899         return ret;
9900 }
9901
9902 /*
9903  * Process the unused_bgs list and remove any that don't have any allocated
9904  * space inside of them.
9905  */
9906 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9907 {
9908         struct btrfs_block_group_cache *block_group;
9909         struct btrfs_space_info *space_info;
9910         struct btrfs_root *root = fs_info->extent_root;
9911         struct btrfs_trans_handle *trans;
9912         int ret = 0;
9913
9914         if (!fs_info->open)
9915                 return;
9916
9917         spin_lock(&fs_info->unused_bgs_lock);
9918         while (!list_empty(&fs_info->unused_bgs)) {
9919                 u64 start, end;
9920
9921                 block_group = list_first_entry(&fs_info->unused_bgs,
9922                                                struct btrfs_block_group_cache,
9923                                                bg_list);
9924                 space_info = block_group->space_info;
9925                 list_del_init(&block_group->bg_list);
9926                 if (ret || btrfs_mixed_space_info(space_info)) {
9927                         btrfs_put_block_group(block_group);
9928                         continue;
9929                 }
9930                 spin_unlock(&fs_info->unused_bgs_lock);
9931
9932                 mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
9933
9934                 /* Don't want to race with allocators so take the groups_sem */
9935                 down_write(&space_info->groups_sem);
9936                 spin_lock(&block_group->lock);
9937                 if (block_group->reserved ||
9938                     btrfs_block_group_used(&block_group->item) ||
9939                     block_group->ro) {
9940                         /*
9941                          * We want to bail if we made new allocations or have
9942                          * outstanding allocations in this block group.  We do
9943                          * the ro check in case balance is currently acting on
9944                          * this block group.
9945                          */
9946                         spin_unlock(&block_group->lock);
9947                         up_write(&space_info->groups_sem);
9948                         goto next;
9949                 }
9950                 spin_unlock(&block_group->lock);
9951
9952                 /* We don't want to force the issue, only flip if it's ok. */
9953                 ret = set_block_group_ro(block_group, 0);
9954                 up_write(&space_info->groups_sem);
9955                 if (ret < 0) {
9956                         ret = 0;
9957                         goto next;
9958                 }
9959
9960                 /*
9961                  * Want to do this before we do anything else so we can recover
9962                  * properly if we fail to join the transaction.
9963                  */
9964                 /* 1 for btrfs_orphan_reserve_metadata() */
9965                 trans = btrfs_start_transaction(root, 1);
9966                 if (IS_ERR(trans)) {
9967                         btrfs_set_block_group_rw(root, block_group);
9968                         ret = PTR_ERR(trans);
9969                         goto next;
9970                 }
9971
9972                 /*
9973                  * We could have pending pinned extents for this block group,
9974                  * just delete them, we don't care about them anymore.
9975                  */
9976                 start = block_group->key.objectid;
9977                 end = start + block_group->key.offset - 1;
9978                 /*
9979                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
9980                  * btrfs_finish_extent_commit(). If we are at transaction N,
9981                  * another task might be running finish_extent_commit() for the
9982                  * previous transaction N - 1, and have seen a range belonging
9983                  * to the block group in freed_extents[] before we were able to
9984                  * clear the whole block group range from freed_extents[]. This
9985                  * means that task can lookup for the block group after we
9986                  * means that task can look up the block group after we
9987                  * a BUG_ON() at btrfs_unpin_extent_range().
9988                  */
9989                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
9990                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
9991                                   EXTENT_DIRTY, GFP_NOFS);
9992                 if (ret) {
9993                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9994                         btrfs_set_block_group_rw(root, block_group);
9995                         goto end_trans;
9996                 }
9997                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
9998                                   EXTENT_DIRTY, GFP_NOFS);
9999                 if (ret) {
10000                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10001                         btrfs_set_block_group_rw(root, block_group);
10002                         goto end_trans;
10003                 }
10004                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10005
10006                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10007                 spin_lock(&space_info->lock);
10008                 spin_lock(&block_group->lock);
10009
10010                 space_info->bytes_pinned -= block_group->pinned;
10011                 space_info->bytes_readonly += block_group->pinned;
10012                 percpu_counter_add(&space_info->total_bytes_pinned,
10013                                    -block_group->pinned);
10014                 block_group->pinned = 0;
10015
10016                 spin_unlock(&block_group->lock);
10017                 spin_unlock(&space_info->lock);
10018
10019                 /*
10020                  * btrfs_remove_chunk() will abort the transaction if things go
10021                  * horribly wrong.
10022                  */
10023                 ret = btrfs_remove_chunk(trans, root,
10024                                          block_group->key.objectid);
10025 end_trans:
10026                 btrfs_end_transaction(trans, root);
10027 next:
10028                 mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
10029                 btrfs_put_block_group(block_group);
10030                 spin_lock(&fs_info->unused_bgs_lock);
10031         }
10032         spin_unlock(&fs_info->unused_bgs_lock);
10033 }
10034
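/*
 * Create the initial (empty) space_info entries: SYSTEM plus either a mixed
 * METADATA|DATA entry or separate METADATA and DATA entries, depending on
 * whether the filesystem uses mixed block groups.
 */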
10035 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10036 {
10037         struct btrfs_space_info *space_info;
10038         struct btrfs_super_block *disk_super;
10039         u64 features;
10040         u64 flags;
10041         int mixed = 0;
10042         int ret;
10043
10044         disk_super = fs_info->super_copy;
10045         if (!btrfs_super_root(disk_super))
10046                 return 1;
10047
10048         features = btrfs_super_incompat_flags(disk_super);
10049         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10050                 mixed = 1;
10051
10052         flags = BTRFS_BLOCK_GROUP_SYSTEM;
10053         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10054         if (ret)
10055                 goto out;
10056
10057         if (mixed) {
10058                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10059                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10060         } else {
10061                 flags = BTRFS_BLOCK_GROUP_METADATA;
10062                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10063                 if (ret)
10064                         goto out;
10065
10066                 flags = BTRFS_BLOCK_GROUP_DATA;
10067                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10068         }
10069 out:
10070         return ret;
10071 }
10072
10073 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
10074 {
10075         return unpin_extent_range(root, start, end, false);
10076 }
10077
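/*
 * Back end of the FITRIM ioctl (see btrfs_ioctl_fitrim()): walk the block
 * groups overlapping the requested range, discard their free space and
 * report the total number of bytes trimmed back through range->len.
 */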
10078 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
10079 {
10080         struct btrfs_fs_info *fs_info = root->fs_info;
10081         struct btrfs_block_group_cache *cache = NULL;
10082         u64 group_trimmed;
10083         u64 start;
10084         u64 end;
10085         u64 trimmed = 0;
10086         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
10087         int ret = 0;
10088
10089         /*
10090          * Try to trim all FS space; our block group may start from a non-zero offset.
10091          */
10092         if (range->len == total_bytes)
10093                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10094         else
10095                 cache = btrfs_lookup_block_group(fs_info, range->start);
10096
10097         while (cache) {
10098                 if (cache->key.objectid >= (range->start + range->len)) {
10099                         btrfs_put_block_group(cache);
10100                         break;
10101                 }
10102
10103                 start = max(range->start, cache->key.objectid);
10104                 end = min(range->start + range->len,
10105                                 cache->key.objectid + cache->key.offset);
10106
10107                 if (end - start >= range->minlen) {
10108                         if (!block_group_cache_done(cache)) {
10109                                 ret = cache_block_group(cache, 0);
10110                                 if (ret) {
10111                                         btrfs_put_block_group(cache);
10112                                         break;
10113                                 }
10114                                 ret = wait_block_group_cache_done(cache);
10115                                 if (ret) {
10116                                         btrfs_put_block_group(cache);
10117                                         break;
10118                                 }
10119                         }
10120                         ret = btrfs_trim_block_group(cache,
10121                                                      &group_trimmed,
10122                                                      start,
10123                                                      end,
10124                                                      range->minlen);
10125
10126                         trimmed += group_trimmed;
10127                         if (ret) {
10128                                 btrfs_put_block_group(cache);
10129                                 break;
10130                         }
10131                 }
10132
10133                 cache = next_block_group(fs_info->tree_root, cache);
10134         }
10135
10136         range->len = trimmed;
10137         return ret;
10138 }
10139
10140 /*
10141  * btrfs_{start,end}_write_no_snapshoting() are similar to
10142  * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
10143  * data into the page cache through nocow before the subvolume is snapshotted
10144  * (flushing that data to disk only after the snapshot is created), or to
10145  * prevent operations while snapshotting is ongoing that would make the
10146  * snapshot inconsistent (writes followed by expanding truncates, for example).
10147  */
10148 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
10149 {
10150         percpu_counter_dec(&root->subv_writers->counter);
10151         /*
10152          * Make sure counter is updated before we wake up
10153          * waiters.
10154          */
10155         smp_mb();
10156         if (waitqueue_active(&root->subv_writers->wait))
10157                 wake_up(&root->subv_writers->wait);
10158 }
10159
10160 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
10161 {
10162         if (atomic_read(&root->will_be_snapshoted))
10163                 return 0;
10164
10165         percpu_counter_inc(&root->subv_writers->counter);
10166         /*
10167          * Make sure counter is updated before we check for snapshot creation.
10168          */
10169         smp_mb();
10170         if (atomic_read(&root->will_be_snapshoted)) {
10171                 btrfs_end_write_no_snapshoting(root);
10172                 return 0;
10173         }
10174         return 1;
10175 }