Btrfs: fix number of transaction units for renames with whiteout

[cascardo/linux.git] / fs / btrfs / inode.c
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 41a5688..0921d2b 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -194,7 +194,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                 while (compressed_size > 0) {
                         cpage = compressed_pages[i];
                         cur_size = min_t(unsigned long, compressed_size,
-                                      PAGE_CACHE_SIZE);
+                                      PAGE_SIZE);
  
                         kaddr = kmap_atomic(cpage);
                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
@@ -208,13 +208,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                                   compress_type);
         } else {
                 page = find_get_page(inode->i_mapping,
-                                    start >> PAGE_CACHE_SHIFT);
+                                    start >> PAGE_SHIFT);
                 btrfs_set_file_extent_compression(leaf, ei, 0);
                 kaddr = kmap_atomic(page);
-               offset = start & (PAGE_CACHE_SIZE - 1);
+               offset = start & (PAGE_SIZE - 1);
                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
                 kunmap_atomic(kaddr);
-               page_cache_release(page);
+               put_page(page);
         }
         btrfs_mark_buffer_dirty(leaf);
         btrfs_release_path(path);
@@ -322,7 +322,7 @@ out:
          * And at reserve time, it's always aligned to page size, so
          * just free one page here.
          */
-       btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
+       btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
         btrfs_free_path(path);
         btrfs_end_transaction(trans, root);
         return ret;
@@ -435,8 +435,8 @@ static noinline void compress_file_range(struct inode *inode,
         actual_end = min_t(u64, isize, end + 1);
  again:
         will_compress = 0;
-       nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
-       nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
+       nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+       nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
  
         /*
          * we don't want to send crud past the end of i_size through
@@ -514,7 +514,7 @@ again:
  
                 if (!ret) {
                         unsigned long offset = total_compressed &
-                               (PAGE_CACHE_SIZE - 1);
+                               (PAGE_SIZE - 1);
                         struct page *page = pages[nr_pages_ret - 1];
                         char *kaddr;
  
@@ -524,7 +524,7 @@ again:
                         if (offset) {
                                 kaddr = kmap_atomic(page);
                                 memset(kaddr + offset, 0,
-                                      PAGE_CACHE_SIZE - offset);
+                                      PAGE_SIZE - offset);
                                 kunmap_atomic(kaddr);
                         }
                         will_compress = 1;
@@ -580,7 +580,7 @@ cont:
                  * one last check to make sure the compression is really a
                  * win, compare the page count read with the blocks on disk
                  */
-               total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
+               total_in = ALIGN(total_in, PAGE_SIZE);
                 if (total_compressed >= total_in) {
                         will_compress = 0;
                 } else {
@@ -594,7 +594,7 @@ cont:
                  */
                 for (i = 0; i < nr_pages_ret; i++) {
                         WARN_ON(pages[i]->mapping);
-                       page_cache_release(pages[i]);
+                       put_page(pages[i]);
                 }
                 kfree(pages);
                 pages = NULL;
@@ -650,7 +650,7 @@ cleanup_and_bail_uncompressed:
  free_pages_out:
         for (i = 0; i < nr_pages_ret; i++) {
                 WARN_ON(pages[i]->mapping);
-               page_cache_release(pages[i]);
+               put_page(pages[i]);
         }
         kfree(pages);
  }
@@ -664,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
  
         for (i = 0; i < async_extent->nr_pages; i++) {
                 WARN_ON(async_extent->pages[i]->mapping);
-               page_cache_release(async_extent->pages[i]);
+               put_page(async_extent->pages[i]);
         }
         kfree(async_extent->pages);
         async_extent->nr_pages = 0;
@@ -824,6 +824,7 @@ retry:
                                                 async_extent->ram_size - 1, 0);
                         goto out_free_reserve;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
  
                 /*
                  * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
         }
         return;
  out_free_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
  out_free:
         extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -966,7 +968,7 @@ static noinline int cow_file_range(struct inode *inode,
                                      PAGE_END_WRITEBACK);
  
                         *nr_written = *nr_written +
-                            (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+                            (end - start + PAGE_SIZE) / PAGE_SIZE;
                         *page_started = 1;
                         goto out;
                 } else if (ret < 0) {
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
                                 goto out_drop_extent_cache;
                 }
  
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
                 if (disk_num_bytes < cur_alloc_size)
                         break;
  
@@ -1066,6 +1070,7 @@ out:
  out_drop_extent_cache:
         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
  out_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
  out_unlock:
         extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1106,8 +1111,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
         async_cow = container_of(work, struct async_cow, work);
  
         root = async_cow->root;
-       nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
-               PAGE_CACHE_SHIFT;
+       nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
+               PAGE_SHIFT;
  
         /*
          * atomic_sub_return implies a barrier for waitqueue_active
@@ -1164,8 +1169,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                 async_cow_start, async_cow_submit,
                                 async_cow_free);
  
-               nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
-                       PAGE_CACHE_SHIFT;
+               nr_pages = (cur_end - start + PAGE_SIZE) >>
+                       PAGE_SHIFT;
                 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
  
                 btrfs_queue_work(root->fs_info->delalloc_workers,
@@ -1960,7 +1965,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                               struct extent_state **cached_state)
  {
-       WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+       WARN_ON((end & (PAGE_SIZE - 1)) == 0);
         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
                                    cached_state, GFP_NOFS);
  }
@@ -1993,7 +1998,7 @@ again:
  
         inode = page->mapping->host;
         page_start = page_offset(page);
-       page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+       page_end = page_offset(page) + PAGE_SIZE - 1;
  
         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
                          &cached_state);
@@ -2003,7 +2008,7 @@ again:
                 goto out;
  
         ordered = btrfs_lookup_ordered_range(inode, page_start,
-                                       PAGE_CACHE_SIZE);
+                                       PAGE_SIZE);
         if (ordered) {
                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
                                      page_end, &cached_state, GFP_NOFS);
@@ -2014,7 +2019,7 @@ again:
         }
  
         ret = btrfs_delalloc_reserve_space(inode, page_start,
-                                          PAGE_CACHE_SIZE);
+                                          PAGE_SIZE);
         if (ret) {
                 mapping_set_error(page->mapping, ret);
                 end_extent_writepage(page, ret, page_start, page_end);
@@ -2030,7 +2035,7 @@ out:
                              &cached_state, GFP_NOFS);
  out_page:
         unlock_page(page);
-       page_cache_release(page);
+       put_page(page);
         kfree(fixup);
  }
  
@@ -2063,7 +2068,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
                 return -EAGAIN;
  
         SetPageChecked(page);
-       page_cache_get(page);
+       get_page(page);
         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
                         btrfs_writepage_fixup_worker, NULL, NULL);
         fixup->page = page;
@@ -4247,7 +4252,7 @@ static int truncate_inline_extent(struct inode *inode,
  
         if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
                 loff_t offset = new_size;
-               loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+               loff_t page_end = ALIGN(offset, PAGE_SIZE);
  
                 /*
                  * Zero out the remaining of the last page of our inline extent,
@@ -4633,7 +4638,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
         struct extent_state *cached_state = NULL;
         char *kaddr;
         u32 blocksize = root->sectorsize;
-       pgoff_t index = from >> PAGE_CACHE_SHIFT;
+       pgoff_t index = from >> PAGE_SHIFT;
         unsigned offset = from & (blocksize - 1);
         struct page *page;
         gfp_t mask = btrfs_alloc_write_mask(mapping);
@@ -4668,7 +4673,7 @@ again:
                 lock_page(page);
                 if (page->mapping != mapping) {
                         unlock_page(page);
-                       page_cache_release(page);
+                       put_page(page);
                         goto again;
                 }
                 if (!PageUptodate(page)) {
@@ -4686,7 +4691,7 @@ again:
                 unlock_extent_cached(io_tree, block_start, block_end,
                                      &cached_state, GFP_NOFS);
                 unlock_page(page);
-               page_cache_release(page);
+               put_page(page);
                 btrfs_start_ordered_extent(inode, ordered, 1);
                 btrfs_put_ordered_extent(ordered);
                 goto again;
@@ -4728,7 +4733,7 @@ out_unlock:
                 btrfs_delalloc_release_space(inode, block_start,
                                              blocksize);
         unlock_page(page);
-       page_cache_release(page);
+       put_page(page);
  out:
         return ret;
  }
@@ -6717,7 +6722,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
  
         read_extent_buffer(leaf, tmp, ptr, inline_size);
  
-       max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+       max_size = min_t(unsigned long, PAGE_SIZE, max_size);
         ret = btrfs_decompress(compress_type, tmp, page,
                                extent_offset, inline_size, max_size);
         kfree(tmp);
@@ -6879,8 +6884,8 @@ next:
  
                 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
                 extent_offset = page_offset(page) + pg_offset - extent_start;
-               copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
-                               size - extent_offset);
+               copy_size = min_t(u64, PAGE_SIZE - pg_offset,
+                                 size - extent_offset);
                 em->start = extent_start + extent_offset;
                 em->len = ALIGN(copy_size, root->sectorsize);
                 em->orig_block_len = em->len;
@@ -6899,9 +6904,9 @@ next:
                                 map = kmap(page);
                                 read_extent_buffer(leaf, map + pg_offset, ptr,
                                                    copy_size);
-                               if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+                               if (pg_offset + copy_size < PAGE_SIZE) {
                                         memset(map + pg_offset + copy_size, 0,
-                                              PAGE_CACHE_SIZE - pg_offset -
+                                              PAGE_SIZE - pg_offset -
                                                copy_size);
                                 }
                                 kunmap(page);
@@ -7162,6 +7167,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                 return ERR_PTR(ret);
         }
  
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
         em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
                               ins.offset, ins.offset, ins.offset, 0);
         if (IS_ERR(em)) {
@@ -7336,12 +7343,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
         int start_idx;
         int end_idx;
  
-       start_idx = start >> PAGE_CACHE_SHIFT;
+       start_idx = start >> PAGE_SHIFT;
  
         /*
          * end is the last byte in the last page.  end == start is legal
          */
-       end_idx = end >> PAGE_CACHE_SHIFT;
+       end_idx = end >> PAGE_SHIFT;
  
         rcu_read_lock();
  
@@ -7382,7 +7389,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
                  * include/linux/pagemap.h for details.
                  */
                 if (unlikely(page != *pagep)) {
-                       page_cache_release(page);
+                       put_page(page);
                         page = NULL;
                 }
         }
@@ -7390,7 +7397,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
         if (page) {
                 if (page->index <= end_idx)
                         found = true;
-               page_cache_release(page);
+               put_page(page);
         }
  
         rcu_read_unlock();
@@ -8719,7 +8726,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
         if (ret == 1) {
                 ClearPagePrivate(page);
                 set_page_private(page, 0);
-               page_cache_release(page);
+               put_page(page);
         }
         return ret;
  }
@@ -8739,7 +8746,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
         struct btrfs_ordered_extent *ordered;
         struct extent_state *cached_state = NULL;
         u64 page_start = page_offset(page);
-       u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+       u64 page_end = page_start + PAGE_SIZE - 1;
         u64 start;
         u64 end;
         int inode_evicting = inode->i_state & I_FREEING;
@@ -8822,7 +8829,7 @@ again:
          * 2) Not written to disk
          *    This means the reserved space should be freed here.
          */
-       btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
+       btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
         if (!inode_evicting) {
                 clear_extent_bit(tree, page_start, page_end,
                                  EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8837,7 +8844,7 @@ again:
         if (PagePrivate(page)) {
                 ClearPagePrivate(page);
                 set_page_private(page, 0);
-               page_cache_release(page);
+               put_page(page);
         }
  }
  
@@ -8874,11 +8881,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         u64 page_end;
         u64 end;
  
-       reserved_space = PAGE_CACHE_SIZE;
+       reserved_space = PAGE_SIZE;
  
         sb_start_pagefault(inode->i_sb);
         page_start = page_offset(page);
-       page_end = page_start + PAGE_CACHE_SIZE - 1;
+       page_end = page_start + PAGE_SIZE - 1;
         end = page_end;
  
         /*
@@ -8934,15 +8941,15 @@ again:
                 goto again;
         }
  
-       if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
+       if (page->index == ((size - 1) >> PAGE_SHIFT)) {
                 reserved_space = round_up(size - page_start, root->sectorsize);
-               if (reserved_space < PAGE_CACHE_SIZE) {
+               if (reserved_space < PAGE_SIZE) {
                         end = page_start + reserved_space - 1;
                         spin_lock(&BTRFS_I(inode)->lock);
                         BTRFS_I(inode)->outstanding_extents++;
                         spin_unlock(&BTRFS_I(inode)->lock);
                         btrfs_delalloc_release_space(inode, page_start,
-                                               PAGE_CACHE_SIZE - reserved_space);
+                                               PAGE_SIZE - reserved_space);
                 }
         }
  
@@ -8969,14 +8976,14 @@ again:
         ret = 0;
  
         /* page is wholly or partially inside EOF */
-       if (page_start + PAGE_CACHE_SIZE > size)
-               zero_start = size & ~PAGE_CACHE_MASK;
+       if (page_start + PAGE_SIZE > size)
+               zero_start = size & ~PAGE_MASK;
         else
-               zero_start = PAGE_CACHE_SIZE;
+               zero_start = PAGE_SIZE;
  
-       if (zero_start != PAGE_CACHE_SIZE) {
+       if (zero_start != PAGE_SIZE) {
                 kaddr = kmap(page);
-               memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+               memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
                 flush_dcache_page(page);
                 kunmap(page);
         }
@@ -9387,10 +9394,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
         return 0;
  }
  
+static int btrfs_rename_exchange(struct inode *old_dir,
+                             struct dentry *old_dentry,
+                             struct inode *new_dir,
+                             struct dentry *new_dentry)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+       struct inode *new_inode = new_dentry->d_inode;
+       struct inode *old_inode = old_dentry->d_inode;
+       struct timespec ctime = CURRENT_TIME;
+       struct dentry *parent;
+       u64 old_ino = btrfs_ino(old_inode);
+       u64 new_ino = btrfs_ino(new_inode);
+       u64 old_idx = 0;
+       u64 new_idx = 0;
+       u64 root_objectid;
+       int ret;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
+
+       /* we only allow renaming a subvolume link between subvolumes */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+               return -EXDEV;
+
+       /* close the race window with snapshot create/destroy ioctl */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&dest->fs_info->subvol_sem);
+
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume they are normal
+        * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 12);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
+
+       /*
+        * We need to find a free sequence number both in the source and
+        * in the destination directory for the exchange.
+        */
+       ret = btrfs_set_inode_index(new_dir, &old_idx);
+       if (ret)
+               goto out_fail;
+       ret = btrfs_set_inode_index(old_dir, &new_idx);
+       if (ret)
+               goto out_fail;
+
+       BTRFS_I(old_inode)->dir_index = 0ULL;
+       BTRFS_I(new_inode)->dir_index = 0ULL;
+
+       /* Reference for the source. */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(root->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_ino,
+                                            btrfs_ino(new_dir), old_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* And now for the dest. */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(dest->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            old_dentry->d_name.name,
+                                            old_dentry->d_name.len,
+                                            new_ino,
+                                            btrfs_ino(old_dir), new_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* Update inode version and ctime/mtime. */
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
+       inode_inc_iversion(new_inode);
+       old_dir->i_ctime = old_dir->i_mtime = ctime;
+       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_inode->i_ctime = ctime;
+       new_inode->i_ctime = ctime;
+
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+               btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+       }
+
+       /* src is a subvolume */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir,
+                                         root_objectid,
+                                         old_dentry->d_name.name,
+                                         old_dentry->d_name.len);
+       } else { /* src is an inode */
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                          old_dentry->d_inode,
+                                          old_dentry->d_name.name,
+                                          old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       /* dest is a subvolume */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                         root_objectid,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+       } else { /* dest is an inode */
+               ret = __btrfs_unlink_inode(trans, dest, new_dir,
+                                          new_dentry->d_inode,
+                                          new_dentry->d_name.name,
+                                          new_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, dest, new_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, old_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, old_dir, new_inode,
+                            old_dentry->d_name.name,
+                            old_dentry->d_name.len, 0, new_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       if (old_inode->i_nlink == 1)
+               BTRFS_I(old_inode)->dir_index = old_idx;
+       if (new_inode->i_nlink == 1)
+               BTRFS_I(new_inode)->dir_index = new_idx;
+
+       if (root_log_pinned) {
+               parent = new_dentry->d_parent;
+               btrfs_log_new_name(trans, old_inode, old_dir, parent);
+               btrfs_end_log_trans(root);
+               root_log_pinned = false;
+       }
+       if (dest_log_pinned) {
+               parent = old_dentry->d_parent;
+               btrfs_log_new_name(trans, new_inode, new_dir, parent);
+               btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
+       }
+out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
+       ret = btrfs_end_transaction(trans, root);
+out_notrans:
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&dest->fs_info->subvol_sem);
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
+
+       return ret;
+}
+
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *dir,
+                                    struct dentry *dentry)
+{
+       int ret;
+       struct inode *inode;
+       u64 objectid;
+       u64 index;
+
+       ret = btrfs_find_free_ino(root, &objectid);
+       if (ret)
+               return ret;
+
+       inode = btrfs_new_inode(trans, root, dir,
+                               dentry->d_name.name,
+                               dentry->d_name.len,
+                               btrfs_ino(dir),
+                               objectid,
+                               S_IFCHR | WHITEOUT_MODE,
+                               &index);
+
+       if (IS_ERR(inode)) {
+               ret = PTR_ERR(inode);
+               return ret;
+       }
+
+       inode->i_op = &btrfs_special_inode_operations;
+       init_special_inode(inode, inode->i_mode,
+               WHITEOUT_DEV);
+
+       ret = btrfs_init_inode_security(trans, inode, dir,
+                               &dentry->d_name);
+       if (ret)
+               goto out;
+
+       ret = btrfs_add_nondir(trans, dir, dentry,
+                               inode, 0, index);
+       if (ret)
+               goto out;
+
+       ret = btrfs_update_inode(trans, root, inode);
+out:
+       unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
+       iput(inode);
+
+       return ret;
+}
+
  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry)
+                          struct inode *new_dir, struct dentry *new_dentry,
+                          unsigned int flags)
  {
         struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
         struct btrfs_root *root = BTRFS_I(old_dir)->root;
         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
         struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9677,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         u64 root_objectid;
         int ret;
         u64 old_ino = btrfs_ino(old_inode);
+       bool log_pinned = false;
  
         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                 return -EPERM;
@@ -9449,15 +9728,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
          * We want to reserve the absolute worst case amount of items.  So if
          * both inodes are subvols and we need to unlink them then that would
          * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
+        * would require 5 item modifications, so we'll assume they are normal
          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
          * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
          */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
         if (IS_ERR(trans)) {
-                ret = PTR_ERR(trans);
-                goto out_notrans;
-        }
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
  
         if (dest != root)
                 btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9756,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(root->fs_info, trans);
         } else {
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@ -9478,14 +9765,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                                              btrfs_ino(new_dir), index);
                 if (ret)
                         goto out_fail;
-               /*
-                * this is an ugly little race, but the rename is required
-                * to make sure that if we crash, the inode is either at the
-                * old name or the new one.  pinning the log transaction lets
-                * us make sure we don't allow a log commit to come in after
-                * we unlink the name but before we add the new name back in.
-                */
-               btrfs_pin_log_trans(root);
         }
  
         inode_inc_iversion(old_dir);
@@ -9552,12 +9831,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (old_inode->i_nlink == 1)
                 BTRFS_I(old_inode)->dir_index = index;
  
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (log_pinned) {
                 struct dentry *parent = new_dentry->d_parent;
+
                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
                 btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
+
+       if (flags & RENAME_WHITEOUT) {
+               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+                                               old_dentry);
+
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
         }
  out_fail:
+       /*
+        * If we have pinned the log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fall back to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * reason for the abort is propagated to userspace when attempting to
+        * commit the transaction. If the log does not contain any of these
+        * inodes, we allow the tasks to sync it.
+        */
+       if (ret && log_pinned) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
         btrfs_end_transaction(trans, root);
  out_notrans:
         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9883,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
  {
-       if (flags & ~RENAME_NOREPLACE)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                 return -EINVAL;
  
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (flags & RENAME_EXCHANGE)
+               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                         new_dentry);
+
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
  }
  
  static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10259,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                 btrfs_end_transaction(trans, root);
                         break;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
  
                 last_alloc = ins.offset;
                 ret = insert_reserved_file_extent(trans, inode,