Btrfs: add snapshot/subvolume destroy ioctl
[cascardo/linux.git] / fs / btrfs / transaction.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/fs.h>
20 #include <linux/sched.h>
21 #include <linux/writeback.h>
22 #include <linux/pagemap.h>
23 #include <linux/blkdev.h>
24 #include "ctree.h"
25 #include "disk-io.h"
26 #include "transaction.h"
27 #include "locking.h"
28 #include "tree-log.h"
29
/*
 * radix tree tag used on fs_info->fs_roots_radix: record_root_in_trans()
 * sets it on a root and commit_fs_roots() looks roots up by it and clears it
 */
30 #define BTRFS_ROOT_TRANS_TAG 0
31
32 static noinline void put_transaction(struct btrfs_transaction *transaction)
33 {
34         WARN_ON(transaction->use_count == 0);
35         transaction->use_count--;
36         if (transaction->use_count == 0) {
37                 list_del_init(&transaction->list);
38                 memset(transaction, 0, sizeof(*transaction));
39                 kmem_cache_free(btrfs_transaction_cachep, transaction);
40         }
41 }
42
43 static noinline void switch_commit_root(struct btrfs_root *root)
44 {
45         free_extent_buffer(root->commit_root);
46         root->commit_root = btrfs_root_node(root);
47 }
48
49 /*
50  * either allocate a new transaction or hop into the existing one
51  */
52 static noinline int join_transaction(struct btrfs_root *root)
53 {
54         struct btrfs_transaction *cur_trans;
55         cur_trans = root->fs_info->running_transaction;
56         if (!cur_trans) {
57                 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
58                                              GFP_NOFS);
59                 BUG_ON(!cur_trans);
60                 root->fs_info->generation++;
61                 cur_trans->num_writers = 1;
62                 cur_trans->num_joined = 0;
63                 cur_trans->transid = root->fs_info->generation;
64                 init_waitqueue_head(&cur_trans->writer_wait);
65                 init_waitqueue_head(&cur_trans->commit_wait);
66                 cur_trans->in_commit = 0;
67                 cur_trans->blocked = 0;
68                 cur_trans->use_count = 1;
69                 cur_trans->commit_done = 0;
70                 cur_trans->start_time = get_seconds();
71
72                 cur_trans->delayed_refs.root.rb_node = NULL;
73                 cur_trans->delayed_refs.num_entries = 0;
74                 cur_trans->delayed_refs.num_heads_ready = 0;
75                 cur_trans->delayed_refs.num_heads = 0;
76                 cur_trans->delayed_refs.flushing = 0;
77                 cur_trans->delayed_refs.run_delayed_start = 0;
78                 spin_lock_init(&cur_trans->delayed_refs.lock);
79
80                 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
81                 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
82                 extent_io_tree_init(&cur_trans->dirty_pages,
83                                      root->fs_info->btree_inode->i_mapping,
84                                      GFP_NOFS);
85                 spin_lock(&root->fs_info->new_trans_lock);
86                 root->fs_info->running_transaction = cur_trans;
87                 spin_unlock(&root->fs_info->new_trans_lock);
88         } else {
89                 cur_trans->num_writers++;
90                 cur_trans->num_joined++;
91         }
92
93         return 0;
94 }
95
96 /*
97  * this does all the record keeping required to make sure that a reference
98  * counted root is properly recorded in a given transaction.  This is required
99  * to make sure the old root from before we joined the transaction is deleted
100  * when the transaction commits
101  */
102 static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
103                                          struct btrfs_root *root)
104 {
105         if (root->ref_cows && root->last_trans < trans->transid) {
106                 WARN_ON(root == root->fs_info->extent_root);
107                 WARN_ON(root->commit_root != root->node);
108
109                 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
110                            (unsigned long)root->root_key.objectid,
111                            BTRFS_ROOT_TRANS_TAG);
112                 root->last_trans = trans->transid;
113                 btrfs_init_reloc_root(trans, root);
114         }
115         return 0;
116 }
117
118 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
119                                struct btrfs_root *root)
120 {
121         if (!root->ref_cows)
122                 return 0;
123
124         mutex_lock(&root->fs_info->trans_mutex);
125         if (root->last_trans == trans->transid) {
126                 mutex_unlock(&root->fs_info->trans_mutex);
127                 return 0;
128         }
129
130         record_root_in_trans(trans, root);
131         mutex_unlock(&root->fs_info->trans_mutex);
132         return 0;
133 }
134
135 /* wait for commit against the current transaction to become unblocked
136  * when this is done, it is safe to start a new transaction, but the current
137  * transaction might not be fully on disk.
138  */
139 static void wait_current_trans(struct btrfs_root *root)
140 {
141         struct btrfs_transaction *cur_trans;
142
143         cur_trans = root->fs_info->running_transaction;
144         if (cur_trans && cur_trans->blocked) {
145                 DEFINE_WAIT(wait);
146                 cur_trans->use_count++;
147                 while (1) {
148                         prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149                                         TASK_UNINTERRUPTIBLE);
150                         if (cur_trans->blocked) {
151                                 mutex_unlock(&root->fs_info->trans_mutex);
152                                 schedule();
153                                 mutex_lock(&root->fs_info->trans_mutex);
154                                 finish_wait(&root->fs_info->transaction_wait,
155                                             &wait);
156                         } else {
157                                 finish_wait(&root->fs_info->transaction_wait,
158                                             &wait);
159                                 break;
160                         }
161                 }
162                 put_transaction(cur_trans);
163         }
164 }
165
166 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
167                                              int num_blocks, int wait)
168 {
169         struct btrfs_trans_handle *h =
170                 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
171         int ret;
172
173         mutex_lock(&root->fs_info->trans_mutex);
174         if (!root->fs_info->log_root_recovering &&
175             ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
176                 wait_current_trans(root);
177         ret = join_transaction(root);
178         BUG_ON(ret);
179
180         h->transid = root->fs_info->running_transaction->transid;
181         h->transaction = root->fs_info->running_transaction;
182         h->blocks_reserved = num_blocks;
183         h->blocks_used = 0;
184         h->block_group = 0;
185         h->alloc_exclude_nr = 0;
186         h->alloc_exclude_start = 0;
187         h->delayed_ref_updates = 0;
188
189         root->fs_info->running_transaction->use_count++;
190         record_root_in_trans(h, root);
191         mutex_unlock(&root->fs_info->trans_mutex);
192         return h;
193 }
194
/*
 * start a transaction handle using wait mode 1 of start_transaction():
 * a blocked running transaction is waited for unless an ioctl transaction
 * is open (fs_info->open_ioctl_trans)
 */
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	const int wait_mode = 1;

	return start_transaction(root, num_blocks, wait_mode);
}
/*
 * start a transaction handle using wait mode 0 of start_transaction():
 * the running transaction is joined without waiting for it to unblock
 */
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	const int wait_mode = 0;

	return start_transaction(root, num_blocks, wait_mode);
}
205
/*
 * start a transaction handle using wait mode 2 of start_transaction():
 * a blocked running transaction is waited for even when
 * fs_info->open_ioctl_trans is set
 */
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	const int wait_mode = 2;

	return start_transaction(r, num_blocks, wait_mode);
}
211
212 /* wait for a transaction commit to be fully complete */
213 static noinline int wait_for_commit(struct btrfs_root *root,
214                                     struct btrfs_transaction *commit)
215 {
216         DEFINE_WAIT(wait);
217         mutex_lock(&root->fs_info->trans_mutex);
218         while (!commit->commit_done) {
219                 prepare_to_wait(&commit->commit_wait, &wait,
220                                 TASK_UNINTERRUPTIBLE);
221                 if (commit->commit_done)
222                         break;
223                 mutex_unlock(&root->fs_info->trans_mutex);
224                 schedule();
225                 mutex_lock(&root->fs_info->trans_mutex);
226         }
227         mutex_unlock(&root->fs_info->trans_mutex);
228         finish_wait(&commit->commit_wait, &wait);
229         return 0;
230 }
231
232 #if 0
233 /*
234  * rate limit against the drop_snapshot code.  This helps to slow down new
235  * operations if the drop_snapshot code isn't able to keep up.
236  */
237 static void throttle_on_drops(struct btrfs_root *root)
238 {
239         struct btrfs_fs_info *info = root->fs_info;
240         int harder_count = 0;
241
242 harder:
243         if (atomic_read(&info->throttles)) {
244                 DEFINE_WAIT(wait);
245                 int thr;
246                 thr = atomic_read(&info->throttle_gen);
247
248                 do {
249                         prepare_to_wait(&info->transaction_throttle,
250                                         &wait, TASK_UNINTERRUPTIBLE);
251                         if (!atomic_read(&info->throttles)) {
252                                 finish_wait(&info->transaction_throttle, &wait);
253                                 break;
254                         }
255                         schedule();
256                         finish_wait(&info->transaction_throttle, &wait);
257                 } while (thr == atomic_read(&info->throttle_gen));
258                 harder_count++;
259
260                 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
261                     harder_count < 2)
262                         goto harder;
263
264                 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
265                     harder_count < 10)
266                         goto harder;
267
268                 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
269                     harder_count < 20)
270                         goto harder;
271         }
272 }
273 #endif
274
275 void btrfs_throttle(struct btrfs_root *root)
276 {
277         mutex_lock(&root->fs_info->trans_mutex);
278         if (!root->fs_info->open_ioctl_trans)
279                 wait_current_trans(root);
280         mutex_unlock(&root->fs_info->trans_mutex);
281 }
282
283 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
284                           struct btrfs_root *root, int throttle)
285 {
286         struct btrfs_transaction *cur_trans;
287         struct btrfs_fs_info *info = root->fs_info;
288         int count = 0;
289
290         while (count < 4) {
291                 unsigned long cur = trans->delayed_ref_updates;
292                 trans->delayed_ref_updates = 0;
293                 if (cur &&
294                     trans->transaction->delayed_refs.num_heads_ready > 64) {
295                         trans->delayed_ref_updates = 0;
296
297                         /*
298                          * do a full flush if the transaction is trying
299                          * to close
300                          */
301                         if (trans->transaction->delayed_refs.flushing)
302                                 cur = 0;
303                         btrfs_run_delayed_refs(trans, root, cur);
304                 } else {
305                         break;
306                 }
307                 count++;
308         }
309
310         mutex_lock(&info->trans_mutex);
311         cur_trans = info->running_transaction;
312         WARN_ON(cur_trans != trans->transaction);
313         WARN_ON(cur_trans->num_writers < 1);
314         cur_trans->num_writers--;
315
316         if (waitqueue_active(&cur_trans->writer_wait))
317                 wake_up(&cur_trans->writer_wait);
318         put_transaction(cur_trans);
319         mutex_unlock(&info->trans_mutex);
320         memset(trans, 0, sizeof(*trans));
321         kmem_cache_free(btrfs_trans_handle_cachep, trans);
322
323         return 0;
324 }
325
/*
 * end a transaction handle, passing throttle == 0 down to
 * __btrfs_end_transaction()
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	const int throttle = 0;

	return __btrfs_end_transaction(trans, root, throttle);
}
331
/*
 * end a transaction handle, passing throttle == 1 down to
 * __btrfs_end_transaction()
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	const int throttle = 1;

	return __btrfs_end_transaction(trans, root, throttle);
}
337
338 /*
339  * when btree blocks are allocated, they have some corresponding bits set for
340  * them in one of two extent_io trees.  This is used to make sure all of
341  * those extents are on disk for transaction or log commit
342  */
343 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
344                                         struct extent_io_tree *dirty_pages)
345 {
346         int ret;
347         int err = 0;
348         int werr = 0;
349         struct page *page;
350         struct inode *btree_inode = root->fs_info->btree_inode;
351         u64 start = 0;
352         u64 end;
353         unsigned long index;
354
355         while (1) {
356                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
357                                             EXTENT_DIRTY);
358                 if (ret)
359                         break;
360                 while (start <= end) {
361                         cond_resched();
362
363                         index = start >> PAGE_CACHE_SHIFT;
364                         start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
365                         page = find_get_page(btree_inode->i_mapping, index);
366                         if (!page)
367                                 continue;
368
369                         btree_lock_page_hook(page);
370                         if (!page->mapping) {
371                                 unlock_page(page);
372                                 page_cache_release(page);
373                                 continue;
374                         }
375
376                         if (PageWriteback(page)) {
377                                 if (PageDirty(page))
378                                         wait_on_page_writeback(page);
379                                 else {
380                                         unlock_page(page);
381                                         page_cache_release(page);
382                                         continue;
383                                 }
384                         }
385                         err = write_one_page(page, 0);
386                         if (err)
387                                 werr = err;
388                         page_cache_release(page);
389                 }
390         }
391         while (1) {
392                 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
393                                             EXTENT_DIRTY);
394                 if (ret)
395                         break;
396
397                 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
398                 while (start <= end) {
399                         index = start >> PAGE_CACHE_SHIFT;
400                         start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
401                         page = find_get_page(btree_inode->i_mapping, index);
402                         if (!page)
403                                 continue;
404                         if (PageDirty(page)) {
405                                 btree_lock_page_hook(page);
406                                 wait_on_page_writeback(page);
407                                 err = write_one_page(page, 0);
408                                 if (err)
409                                         werr = err;
410                         }
411                         wait_on_page_writeback(page);
412                         page_cache_release(page);
413                         cond_resched();
414                 }
415         }
416         if (err)
417                 werr = err;
418         return werr;
419 }
420
421 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
422                                      struct btrfs_root *root)
423 {
424         if (!trans || !trans->transaction) {
425                 struct inode *btree_inode;
426                 btree_inode = root->fs_info->btree_inode;
427                 return filemap_write_and_wait(btree_inode->i_mapping);
428         }
429         return btrfs_write_and_wait_marked_extents(root,
430                                            &trans->transaction->dirty_pages);
431 }
432
433 /*
434  * this is used to update the root pointer in the tree of tree roots.
435  *
436  * But, in the case of the extent allocation tree, updating the root
437  * pointer may allocate blocks which may change the root of the extent
438  * allocation tree.
439  *
440  * So, this loops and repeats and makes sure the cowonly root didn't
441  * change while the root pointer was being updated in the metadata.
442  */
443 static int update_cowonly_root(struct btrfs_trans_handle *trans,
444                                struct btrfs_root *root)
445 {
446         int ret;
447         u64 old_root_bytenr;
448         struct btrfs_root *tree_root = root->fs_info->tree_root;
449
450         btrfs_write_dirty_block_groups(trans, root);
451
452         while (1) {
453                 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
454                 if (old_root_bytenr == root->node->start)
455                         break;
456
457                 btrfs_set_root_node(&root->root_item, root->node);
458                 ret = btrfs_update_root(trans, tree_root,
459                                         &root->root_key,
460                                         &root->root_item);
461                 BUG_ON(ret);
462
463                 ret = btrfs_write_dirty_block_groups(trans, root);
464                 BUG_ON(ret);
465         }
466
467         if (root != root->fs_info->extent_root)
468                 switch_commit_root(root);
469
470         return 0;
471 }
472
473 /*
474  * update all the cowonly tree roots on disk
475  */
476 static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
477                                          struct btrfs_root *root)
478 {
479         struct btrfs_fs_info *fs_info = root->fs_info;
480         struct list_head *next;
481         struct extent_buffer *eb;
482         int ret;
483
484         ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
485         BUG_ON(ret);
486
487         eb = btrfs_lock_root_node(fs_info->tree_root);
488         btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
489         btrfs_tree_unlock(eb);
490         free_extent_buffer(eb);
491
492         ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
493         BUG_ON(ret);
494
495         while (!list_empty(&fs_info->dirty_cowonly_roots)) {
496                 next = fs_info->dirty_cowonly_roots.next;
497                 list_del_init(next);
498                 root = list_entry(next, struct btrfs_root, dirty_list);
499
500                 update_cowonly_root(trans, root);
501         }
502
503         down_write(&fs_info->extent_commit_sem);
504         switch_commit_root(fs_info->extent_root);
505         up_write(&fs_info->extent_commit_sem);
506
507         return 0;
508 }
509
510 /*
511  * dead roots are old snapshots that need to be deleted.  This allocates
512  * a dirty root struct and adds it into the list of dead roots that need to
513  * be deleted
514  */
515 int btrfs_add_dead_root(struct btrfs_root *root)
516 {
517         mutex_lock(&root->fs_info->trans_mutex);
518         list_add(&root->root_list, &root->fs_info->dead_roots);
519         mutex_unlock(&root->fs_info->trans_mutex);
520         return 0;
521 }
522
523 /*
524  * update all the cowonly tree roots on disk
525  */
526 static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
527                                     struct btrfs_root *root)
528 {
529         struct btrfs_root *gang[8];
530         struct btrfs_fs_info *fs_info = root->fs_info;
531         int i;
532         int ret;
533         int err = 0;
534
535         while (1) {
536                 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
537                                                  (void **)gang, 0,
538                                                  ARRAY_SIZE(gang),
539                                                  BTRFS_ROOT_TRANS_TAG);
540                 if (ret == 0)
541                         break;
542                 for (i = 0; i < ret; i++) {
543                         root = gang[i];
544                         radix_tree_tag_clear(&fs_info->fs_roots_radix,
545                                         (unsigned long)root->root_key.objectid,
546                                         BTRFS_ROOT_TRANS_TAG);
547
548                         btrfs_free_log(trans, root);
549                         btrfs_update_reloc_root(trans, root);
550
551                         if (root->commit_root != root->node) {
552                                 switch_commit_root(root);
553                                 btrfs_set_root_node(&root->root_item,
554                                                     root->node);
555                         }
556
557                         err = btrfs_update_root(trans, fs_info->tree_root,
558                                                 &root->root_key,
559                                                 &root->root_item);
560                         if (err)
561                                 break;
562                 }
563         }
564         return err;
565 }
566
567 /*
568  * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
569  * otherwise every leaf in the btree is read and defragged.
570  */
571 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
572 {
573         struct btrfs_fs_info *info = root->fs_info;
574         int ret;
575         struct btrfs_trans_handle *trans;
576         unsigned long nr;
577
578         smp_mb();
579         if (root->defrag_running)
580                 return 0;
581         trans = btrfs_start_transaction(root, 1);
582         while (1) {
583                 root->defrag_running = 1;
584                 ret = btrfs_defrag_leaves(trans, root, cacheonly);
585                 nr = trans->blocks_used;
586                 btrfs_end_transaction(trans, root);
587                 btrfs_btree_balance_dirty(info->tree_root, nr);
588                 cond_resched();
589
590                 trans = btrfs_start_transaction(root, 1);
591                 if (root->fs_info->closing || ret != -EAGAIN)
592                         break;
593         }
594         root->defrag_running = 0;
595         smp_mb();
596         btrfs_end_transaction(trans, root);
597         return 0;
598 }
599
600 #if 0
601 /*
602  * when dropping snapshots, we generate a ton of delayed refs, and it makes
603  * sense not to join the transaction while it is trying to flush the current
604  * queue of delayed refs out.
605  *
606  * This is used by the drop snapshot code only
607  */
608 static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
609 {
610         DEFINE_WAIT(wait);
611
612         mutex_lock(&info->trans_mutex);
613         while (info->running_transaction &&
614                info->running_transaction->delayed_refs.flushing) {
615                 prepare_to_wait(&info->transaction_wait, &wait,
616                                 TASK_UNINTERRUPTIBLE);
617                 mutex_unlock(&info->trans_mutex);
618
619                 schedule();
620
621                 mutex_lock(&info->trans_mutex);
622                 finish_wait(&info->transaction_wait, &wait);
623         }
624         mutex_unlock(&info->trans_mutex);
625         return 0;
626 }
627
628 /*
629  * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
630  * all of them
631  */
632 int btrfs_drop_dead_root(struct btrfs_root *root)
633 {
634         struct btrfs_trans_handle *trans;
635         struct btrfs_root *tree_root = root->fs_info->tree_root;
636         unsigned long nr;
637         int ret;
638
639         while (1) {
640                 /*
641                  * we don't want to jump in and create a bunch of
642                  * delayed refs if the transaction is starting to close
643                  */
644                 wait_transaction_pre_flush(tree_root->fs_info);
645                 trans = btrfs_start_transaction(tree_root, 1);
646
647                 /*
648                  * we've joined a transaction, make sure it isn't
649                  * closing right now
650                  */
651                 if (trans->transaction->delayed_refs.flushing) {
652                         btrfs_end_transaction(trans, tree_root);
653                         continue;
654                 }
655
656                 ret = btrfs_drop_snapshot(trans, root);
657                 if (ret != -EAGAIN)
658                         break;
659
660                 ret = btrfs_update_root(trans, tree_root,
661                                         &root->root_key,
662                                         &root->root_item);
663                 if (ret)
664                         break;
665
666                 nr = trans->blocks_used;
667                 ret = btrfs_end_transaction(trans, tree_root);
668                 BUG_ON(ret);
669
670                 btrfs_btree_balance_dirty(tree_root, nr);
671                 cond_resched();
672         }
673         BUG_ON(ret);
674
675         ret = btrfs_del_root(trans, tree_root, &root->root_key);
676         BUG_ON(ret);
677
678         nr = trans->blocks_used;
679         ret = btrfs_end_transaction(trans, tree_root);
680         BUG_ON(ret);
681
682         free_extent_buffer(root->node);
683         free_extent_buffer(root->commit_root);
684         kfree(root);
685
686         btrfs_btree_balance_dirty(tree_root, nr);
687         return ret;
688 }
689 #endif
690
691 /*
692  * new snapshots need to be created at a very specific time in the
693  * transaction commit.  This does the actual creation
694  */
695 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
696                                    struct btrfs_fs_info *fs_info,
697                                    struct btrfs_pending_snapshot *pending)
698 {
699         struct btrfs_key key;
700         struct btrfs_root_item *new_root_item;
701         struct btrfs_root *tree_root = fs_info->tree_root;
702         struct btrfs_root *root = pending->root;
703         struct extent_buffer *tmp;
704         struct extent_buffer *old;
705         int ret;
706         u64 objectid;
707
708         new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
709         if (!new_root_item) {
710                 ret = -ENOMEM;
711                 goto fail;
712         }
713         ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
714         if (ret)
715                 goto fail;
716
717         record_root_in_trans(trans, root);
718         btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
719         memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
720
721         key.objectid = objectid;
722         /* record when the snapshot was created in key.offset */
723         key.offset = trans->transid;
724         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
725
726         old = btrfs_lock_root_node(root);
727         btrfs_cow_block(trans, root, old, NULL, 0, &old);
728         btrfs_set_lock_blocking(old);
729
730         btrfs_copy_root(trans, root, old, &tmp, objectid);
731         btrfs_tree_unlock(old);
732         free_extent_buffer(old);
733
734         btrfs_set_root_node(new_root_item, tmp);
735         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
736                                 new_root_item);
737         btrfs_tree_unlock(tmp);
738         free_extent_buffer(tmp);
739         if (ret)
740                 goto fail;
741
742         key.offset = (u64)-1;
743         memcpy(&pending->root_key, &key, sizeof(key));
744 fail:
745         kfree(new_root_item);
746         return ret;
747 }
748
749 static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
750                                    struct btrfs_pending_snapshot *pending)
751 {
752         int ret;
753         int namelen;
754         u64 index = 0;
755         struct btrfs_trans_handle *trans;
756         struct inode *parent_inode;
757         struct inode *inode;
758         struct btrfs_root *parent_root;
759
760         parent_inode = pending->dentry->d_parent->d_inode;
761         parent_root = BTRFS_I(parent_inode)->root;
762         trans = btrfs_join_transaction(parent_root, 1);
763
764         /*
765          * insert the directory item
766          */
767         namelen = strlen(pending->name);
768         ret = btrfs_set_inode_index(parent_inode, &index);
769         ret = btrfs_insert_dir_item(trans, parent_root,
770                             pending->name, namelen,
771                             parent_inode->i_ino,
772                             &pending->root_key, BTRFS_FT_DIR, index);
773
774         if (ret)
775                 goto fail;
776
777         btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
778         ret = btrfs_update_inode(trans, parent_root, parent_inode);
779         BUG_ON(ret);
780
781         ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
782                                  pending->root_key.objectid,
783                                  parent_root->root_key.objectid,
784                                  parent_inode->i_ino, index, pending->name,
785                                  namelen);
786
787         BUG_ON(ret);
788
789         inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
790         d_instantiate(pending->dentry, inode);
791 fail:
792         btrfs_end_transaction(trans, fs_info->fs_root);
793         return ret;
794 }
795
796 /*
797  * create all the snapshots we've scheduled for creation
798  */
799 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
800                                              struct btrfs_fs_info *fs_info)
801 {
802         struct btrfs_pending_snapshot *pending;
803         struct list_head *head = &trans->transaction->pending_snapshots;
804         int ret;
805
806         list_for_each_entry(pending, head, list) {
807                 ret = create_pending_snapshot(trans, fs_info, pending);
808                 BUG_ON(ret);
809         }
810         return 0;
811 }
812
813 static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
814                                              struct btrfs_fs_info *fs_info)
815 {
816         struct btrfs_pending_snapshot *pending;
817         struct list_head *head = &trans->transaction->pending_snapshots;
818         int ret;
819
820         while (!list_empty(head)) {
821                 pending = list_entry(head->next,
822                                      struct btrfs_pending_snapshot, list);
823                 ret = finish_pending_snapshot(fs_info, pending);
824                 BUG_ON(ret);
825                 list_del(&pending->list);
826                 kfree(pending->name);
827                 kfree(pending);
828         }
829         return 0;
830 }
831
832 static void update_super_roots(struct btrfs_root *root)
833 {
834         struct btrfs_root_item *root_item;
835         struct btrfs_super_block *super;
836
837         super = &root->fs_info->super_copy;
838
839         root_item = &root->fs_info->chunk_root->root_item;
840         super->chunk_root = root_item->bytenr;
841         super->chunk_root_generation = root_item->generation;
842         super->chunk_root_level = root_item->level;
843
844         root_item = &root->fs_info->tree_root->root_item;
845         super->root = root_item->bytenr;
846         super->generation = root_item->generation;
847         super->root_level = root_item->level;
848 }
849
850 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
851 {
852         int ret = 0;
853         spin_lock(&info->new_trans_lock);
854         if (info->running_transaction)
855                 ret = info->running_transaction->in_commit;
856         spin_unlock(&info->new_trans_lock);
857         return ret;
858 }
859
860 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
861                              struct btrfs_root *root)
862 {
863         unsigned long joined = 0;
864         unsigned long timeout = 1;
865         struct btrfs_transaction *cur_trans;
866         struct btrfs_transaction *prev_trans = NULL;
867         DEFINE_WAIT(wait);
868         int ret;
869         int should_grow = 0;
870         unsigned long now = get_seconds();
871         int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
872
873         btrfs_run_ordered_operations(root, 0);
874
875         /* make a pass through all the delayed refs we have so far
876          * any runnings procs may add more while we are here
877          */
878         ret = btrfs_run_delayed_refs(trans, root, 0);
879         BUG_ON(ret);
880
881         cur_trans = trans->transaction;
882         /*
883          * set the flushing flag so procs in this transaction have to
884          * start sending their work down.
885          */
886         cur_trans->delayed_refs.flushing = 1;
887
888         ret = btrfs_run_delayed_refs(trans, root, 0);
889         BUG_ON(ret);
890
891         mutex_lock(&root->fs_info->trans_mutex);
892         if (cur_trans->in_commit) {
893                 cur_trans->use_count++;
894                 mutex_unlock(&root->fs_info->trans_mutex);
895                 btrfs_end_transaction(trans, root);
896
897                 ret = wait_for_commit(root, cur_trans);
898                 BUG_ON(ret);
899
900                 mutex_lock(&root->fs_info->trans_mutex);
901                 put_transaction(cur_trans);
902                 mutex_unlock(&root->fs_info->trans_mutex);
903
904                 return 0;
905         }
906
907         trans->transaction->in_commit = 1;
908         trans->transaction->blocked = 1;
909         if (cur_trans->list.prev != &root->fs_info->trans_list) {
910                 prev_trans = list_entry(cur_trans->list.prev,
911                                         struct btrfs_transaction, list);
912                 if (!prev_trans->commit_done) {
913                         prev_trans->use_count++;
914                         mutex_unlock(&root->fs_info->trans_mutex);
915
916                         wait_for_commit(root, prev_trans);
917
918                         mutex_lock(&root->fs_info->trans_mutex);
919                         put_transaction(prev_trans);
920                 }
921         }
922
923         if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
924                 should_grow = 1;
925
926         do {
927                 int snap_pending = 0;
928                 joined = cur_trans->num_joined;
929                 if (!list_empty(&trans->transaction->pending_snapshots))
930                         snap_pending = 1;
931
932                 WARN_ON(cur_trans != trans->transaction);
933                 prepare_to_wait(&cur_trans->writer_wait, &wait,
934                                 TASK_UNINTERRUPTIBLE);
935
936                 if (cur_trans->num_writers > 1)
937                         timeout = MAX_SCHEDULE_TIMEOUT;
938                 else if (should_grow)
939                         timeout = 1;
940
941                 mutex_unlock(&root->fs_info->trans_mutex);
942
943                 if (flush_on_commit) {
944                         btrfs_start_delalloc_inodes(root);
945                         ret = btrfs_wait_ordered_extents(root, 0);
946                         BUG_ON(ret);
947                 } else if (snap_pending) {
948                         ret = btrfs_wait_ordered_extents(root, 1);
949                         BUG_ON(ret);
950                 }
951
952                 /*
953                  * rename don't use btrfs_join_transaction, so, once we
954                  * set the transaction to blocked above, we aren't going
955                  * to get any new ordered operations.  We can safely run
956                  * it here and no for sure that nothing new will be added
957                  * to the list
958                  */
959                 btrfs_run_ordered_operations(root, 1);
960
961                 smp_mb();
962                 if (cur_trans->num_writers > 1 || should_grow)
963                         schedule_timeout(timeout);
964
965                 mutex_lock(&root->fs_info->trans_mutex);
966                 finish_wait(&cur_trans->writer_wait, &wait);
967         } while (cur_trans->num_writers > 1 ||
968                  (should_grow && cur_trans->num_joined != joined));
969
970         ret = create_pending_snapshots(trans, root->fs_info);
971         BUG_ON(ret);
972
973         ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
974         BUG_ON(ret);
975
976         WARN_ON(cur_trans != trans->transaction);
977
978         /* btrfs_commit_tree_roots is responsible for getting the
979          * various roots consistent with each other.  Every pointer
980          * in the tree of tree roots has to point to the most up to date
981          * root for every subvolume and other tree.  So, we have to keep
982          * the tree logging code from jumping in and changing any
983          * of the trees.
984          *
985          * At this point in the commit, there can't be any tree-log
986          * writers, but a little lower down we drop the trans mutex
987          * and let new people in.  By holding the tree_log_mutex
988          * from now until after the super is written, we avoid races
989          * with the tree-log code.
990          */
991         mutex_lock(&root->fs_info->tree_log_mutex);
992
993         ret = commit_fs_roots(trans, root);
994         BUG_ON(ret);
995
996         /* commit_fs_roots gets rid of all the tree log roots, it is now
997          * safe to free the root of tree log roots
998          */
999         btrfs_free_log_root_tree(trans, root->fs_info);
1000
1001         ret = commit_cowonly_roots(trans, root);
1002         BUG_ON(ret);
1003
1004         btrfs_prepare_extent_commit(trans, root);
1005
1006         cur_trans = root->fs_info->running_transaction;
1007         spin_lock(&root->fs_info->new_trans_lock);
1008         root->fs_info->running_transaction = NULL;
1009         spin_unlock(&root->fs_info->new_trans_lock);
1010
1011         btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1012                             root->fs_info->tree_root->node);
1013         switch_commit_root(root->fs_info->tree_root);
1014
1015         btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1016                             root->fs_info->chunk_root->node);
1017         switch_commit_root(root->fs_info->chunk_root);
1018
1019         update_super_roots(root);
1020
1021         if (!root->fs_info->log_root_recovering) {
1022                 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1023                 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1024         }
1025
1026         memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1027                sizeof(root->fs_info->super_copy));
1028
1029         trans->transaction->blocked = 0;
1030
1031         wake_up(&root->fs_info->transaction_wait);
1032
1033         mutex_unlock(&root->fs_info->trans_mutex);
1034         ret = btrfs_write_and_wait_transaction(trans, root);
1035         BUG_ON(ret);
1036         write_ctree_super(trans, root, 0);
1037
1038         /*
1039          * the super is written, we can safely allow the tree-loggers
1040          * to go about their business
1041          */
1042         mutex_unlock(&root->fs_info->tree_log_mutex);
1043
1044         btrfs_finish_extent_commit(trans, root);
1045
1046         /* do the directory inserts of any pending snapshot creations */
1047         finish_pending_snapshots(trans, root->fs_info);
1048
1049         mutex_lock(&root->fs_info->trans_mutex);
1050
1051         cur_trans->commit_done = 1;
1052
1053         root->fs_info->last_trans_committed = cur_trans->transid;
1054
1055         wake_up(&cur_trans->commit_wait);
1056
1057         put_transaction(cur_trans);
1058         put_transaction(cur_trans);
1059
1060         mutex_unlock(&root->fs_info->trans_mutex);
1061
1062         kmem_cache_free(btrfs_trans_handle_cachep, trans);
1063         return ret;
1064 }
1065
1066 /*
1067  * interface function to delete all the snapshots we have scheduled for deletion
1068  */
1069 int btrfs_clean_old_snapshots(struct btrfs_root *root)
1070 {
1071         LIST_HEAD(list);
1072         struct btrfs_fs_info *fs_info = root->fs_info;
1073
1074         mutex_lock(&fs_info->trans_mutex);
1075         list_splice_init(&fs_info->dead_roots, &list);
1076         mutex_unlock(&fs_info->trans_mutex);
1077
1078         while (!list_empty(&list)) {
1079                 root = list_entry(list.next, struct btrfs_root, root_list);
1080                 list_del(&root->root_list);
1081
1082                 if (btrfs_header_backref_rev(root->node) <
1083                     BTRFS_MIXED_BACKREF_REV)
1084                         btrfs_drop_snapshot(root, 0);
1085                 else
1086                         btrfs_drop_snapshot(root, 1);
1087         }
1088         return 0;
1089 }