Btrfs: fix number of transaction units for renames with whiteout
[cascardo/linux.git] / fs / btrfs / inode.c
index 41a5688..0921d2b 100644 (file)
@@ -194,7 +194,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
-                                      PAGE_CACHE_SIZE);
+                                      PAGE_SIZE);
 
                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
@@ -208,13 +208,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
-                                    start >> PAGE_CACHE_SHIFT);
+                                    start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
-               offset = start & (PAGE_CACHE_SIZE - 1);
+               offset = start & (PAGE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
-               page_cache_release(page);
+               put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
@@ -322,7 +322,7 @@ out:
         * And at reserve time, it's always aligned to page size, so
         * just free one page here.
         */
-       btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
+       btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
@@ -435,8 +435,8 @@ static noinline void compress_file_range(struct inode *inode,
        actual_end = min_t(u64, isize, end + 1);
 again:
        will_compress = 0;
-       nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
-       nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
+       nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+       nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
 
        /*
         * we don't want to send crud past the end of i_size through
@@ -514,7 +514,7 @@ again:
 
                if (!ret) {
                        unsigned long offset = total_compressed &
-                               (PAGE_CACHE_SIZE - 1);
+                               (PAGE_SIZE - 1);
                        struct page *page = pages[nr_pages_ret - 1];
                        char *kaddr;
 
@@ -524,7 +524,7 @@ again:
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
-                                      PAGE_CACHE_SIZE - offset);
+                                      PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
@@ -580,7 +580,7 @@ cont:
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk
                 */
-               total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
+               total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
@@ -594,7 +594,7 @@ cont:
                 */
                for (i = 0; i < nr_pages_ret; i++) {
                        WARN_ON(pages[i]->mapping);
-                       page_cache_release(pages[i]);
+                       put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
@@ -650,7 +650,7 @@ cleanup_and_bail_uncompressed:
 free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
                WARN_ON(pages[i]->mapping);
-               page_cache_release(pages[i]);
+               put_page(pages[i]);
        }
        kfree(pages);
 }
@@ -664,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
 
        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
-               page_cache_release(async_extent->pages[i]);
+               put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
@@ -824,6 +824,7 @@ retry:
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
 
                /*
                 * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
        }
        return;
 out_free_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -966,7 +968,7 @@ static noinline int cow_file_range(struct inode *inode,
                                     PAGE_END_WRITEBACK);
 
                        *nr_written = *nr_written +
-                            (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+                            (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
                                goto out_drop_extent_cache;
                }
 
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
                if (disk_num_bytes < cur_alloc_size)
                        break;
 
@@ -1066,6 +1070,7 @@ out:
 out_drop_extent_cache:
        btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
 out_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
        extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1106,8 +1111,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
        async_cow = container_of(work, struct async_cow, work);
 
        root = async_cow->root;
-       nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
-               PAGE_CACHE_SHIFT;
+       nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
+               PAGE_SHIFT;
 
        /*
         * atomic_sub_return implies a barrier for waitqueue_active
@@ -1164,8 +1169,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                async_cow_start, async_cow_submit,
                                async_cow_free);
 
-               nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
-                       PAGE_CACHE_SHIFT;
+               nr_pages = (cur_end - start + PAGE_SIZE) >>
+                       PAGE_SHIFT;
                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
 
                btrfs_queue_work(root->fs_info->delalloc_workers,
@@ -1960,7 +1965,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state)
 {
-       WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+       WARN_ON((end & (PAGE_SIZE - 1)) == 0);
        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
                                   cached_state, GFP_NOFS);
 }
@@ -1993,7 +1998,7 @@ again:
 
        inode = page->mapping->host;
        page_start = page_offset(page);
-       page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+       page_end = page_offset(page) + PAGE_SIZE - 1;
 
        lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
                         &cached_state);
@@ -2003,7 +2008,7 @@ again:
                goto out;
 
        ordered = btrfs_lookup_ordered_range(inode, page_start,
-                                       PAGE_CACHE_SIZE);
+                                       PAGE_SIZE);
        if (ordered) {
                unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
                                     page_end, &cached_state, GFP_NOFS);
@@ -2014,7 +2019,7 @@ again:
        }
 
        ret = btrfs_delalloc_reserve_space(inode, page_start,
-                                          PAGE_CACHE_SIZE);
+                                          PAGE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
@@ -2030,7 +2035,7 @@ out:
                             &cached_state, GFP_NOFS);
 out_page:
        unlock_page(page);
-       page_cache_release(page);
+       put_page(page);
        kfree(fixup);
 }
 
@@ -2063,7 +2068,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
                return -EAGAIN;
 
        SetPageChecked(page);
-       page_cache_get(page);
+       get_page(page);
        btrfs_init_work(&fixup->work, btrfs_fixup_helper,
                        btrfs_writepage_fixup_worker, NULL, NULL);
        fixup->page = page;
@@ -4247,7 +4252,7 @@ static int truncate_inline_extent(struct inode *inode,
 
        if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
                loff_t offset = new_size;
-               loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+               loff_t page_end = ALIGN(offset, PAGE_SIZE);
 
                /*
                 * Zero out the remaining of the last page of our inline extent,
@@ -4633,7 +4638,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
        struct extent_state *cached_state = NULL;
        char *kaddr;
        u32 blocksize = root->sectorsize;
-       pgoff_t index = from >> PAGE_CACHE_SHIFT;
+       pgoff_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (blocksize - 1);
        struct page *page;
        gfp_t mask = btrfs_alloc_write_mask(mapping);
@@ -4668,7 +4673,7 @@ again:
                lock_page(page);
                if (page->mapping != mapping) {
                        unlock_page(page);
-                       page_cache_release(page);
+                       put_page(page);
                        goto again;
                }
                if (!PageUptodate(page)) {
@@ -4686,7 +4691,7 @@ again:
                unlock_extent_cached(io_tree, block_start, block_end,
                                     &cached_state, GFP_NOFS);
                unlock_page(page);
-               page_cache_release(page);
+               put_page(page);
                btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
                goto again;
@@ -4728,7 +4733,7 @@ out_unlock:
                btrfs_delalloc_release_space(inode, block_start,
                                             blocksize);
        unlock_page(page);
-       page_cache_release(page);
+       put_page(page);
 out:
        return ret;
 }
@@ -6717,7 +6722,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 
        read_extent_buffer(leaf, tmp, ptr, inline_size);
 
-       max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+       max_size = min_t(unsigned long, PAGE_SIZE, max_size);
        ret = btrfs_decompress(compress_type, tmp, page,
                               extent_offset, inline_size, max_size);
        kfree(tmp);
@@ -6879,8 +6884,8 @@ next:
 
                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
                extent_offset = page_offset(page) + pg_offset - extent_start;
-               copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
-                               size - extent_offset);
+               copy_size = min_t(u64, PAGE_SIZE - pg_offset,
+                                 size - extent_offset);
                em->start = extent_start + extent_offset;
                em->len = ALIGN(copy_size, root->sectorsize);
                em->orig_block_len = em->len;
@@ -6899,9 +6904,9 @@ next:
                                map = kmap(page);
                                read_extent_buffer(leaf, map + pg_offset, ptr,
                                                   copy_size);
-                               if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+                               if (pg_offset + copy_size < PAGE_SIZE) {
                                        memset(map + pg_offset + copy_size, 0,
-                                              PAGE_CACHE_SIZE - pg_offset -
+                                              PAGE_SIZE - pg_offset -
                                               copy_size);
                                }
                                kunmap(page);
@@ -7162,6 +7167,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                return ERR_PTR(ret);
        }
 
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
        em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
                              ins.offset, ins.offset, ins.offset, 0);
        if (IS_ERR(em)) {
@@ -7336,12 +7343,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
        int start_idx;
        int end_idx;
 
-       start_idx = start >> PAGE_CACHE_SHIFT;
+       start_idx = start >> PAGE_SHIFT;
 
        /*
         * end is the last byte in the last page.  end == start is legal
         */
-       end_idx = end >> PAGE_CACHE_SHIFT;
+       end_idx = end >> PAGE_SHIFT;
 
        rcu_read_lock();
 
@@ -7382,7 +7389,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
                 * include/linux/pagemap.h for details.
                 */
                if (unlikely(page != *pagep)) {
-                       page_cache_release(page);
+                       put_page(page);
                        page = NULL;
                }
        }
@@ -7390,7 +7397,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
        if (page) {
                if (page->index <= end_idx)
                        found = true;
-               page_cache_release(page);
+               put_page(page);
        }
 
        rcu_read_unlock();
@@ -8719,7 +8726,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
        if (ret == 1) {
                ClearPagePrivate(page);
                set_page_private(page, 0);
-               page_cache_release(page);
+               put_page(page);
        }
        return ret;
 }
@@ -8739,7 +8746,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        u64 page_start = page_offset(page);
-       u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+       u64 page_end = page_start + PAGE_SIZE - 1;
        u64 start;
        u64 end;
        int inode_evicting = inode->i_state & I_FREEING;
@@ -8822,7 +8829,7 @@ again:
         * 2) Not written to disk
         *    This means the reserved space should be freed here.
         */
-       btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
+       btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8837,7 +8844,7 @@ again:
        if (PagePrivate(page)) {
                ClearPagePrivate(page);
                set_page_private(page, 0);
-               page_cache_release(page);
+               put_page(page);
        }
 }
 
@@ -8874,11 +8881,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_end;
        u64 end;
 
-       reserved_space = PAGE_CACHE_SIZE;
+       reserved_space = PAGE_SIZE;
 
        sb_start_pagefault(inode->i_sb);
        page_start = page_offset(page);
-       page_end = page_start + PAGE_CACHE_SIZE - 1;
+       page_end = page_start + PAGE_SIZE - 1;
        end = page_end;
 
        /*
@@ -8934,15 +8941,15 @@ again:
                goto again;
        }
 
-       if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
+       if (page->index == ((size - 1) >> PAGE_SHIFT)) {
                reserved_space = round_up(size - page_start, root->sectorsize);
-               if (reserved_space < PAGE_CACHE_SIZE) {
+               if (reserved_space < PAGE_SIZE) {
                        end = page_start + reserved_space - 1;
                        spin_lock(&BTRFS_I(inode)->lock);
                        BTRFS_I(inode)->outstanding_extents++;
                        spin_unlock(&BTRFS_I(inode)->lock);
                        btrfs_delalloc_release_space(inode, page_start,
-                                               PAGE_CACHE_SIZE - reserved_space);
+                                               PAGE_SIZE - reserved_space);
                }
        }
 
@@ -8969,14 +8976,14 @@ again:
        ret = 0;
 
        /* page is wholly or partially inside EOF */
-       if (page_start + PAGE_CACHE_SIZE > size)
-               zero_start = size & ~PAGE_CACHE_MASK;
+       if (page_start + PAGE_SIZE > size)
+               zero_start = size & ~PAGE_MASK;
        else
-               zero_start = PAGE_CACHE_SIZE;
+               zero_start = PAGE_SIZE;
 
-       if (zero_start != PAGE_CACHE_SIZE) {
+       if (zero_start != PAGE_SIZE) {
                kaddr = kmap(page);
-               memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+               memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
                flush_dcache_page(page);
                kunmap(page);
        }
@@ -9387,10 +9394,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
        return 0;
 }
 
+static int btrfs_rename_exchange(struct inode *old_dir,
+                             struct dentry *old_dentry,
+                             struct inode *new_dir,
+                             struct dentry *new_dentry)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+       struct inode *new_inode = new_dentry->d_inode;
+       struct inode *old_inode = old_dentry->d_inode;
+       struct timespec ctime = CURRENT_TIME;
+       struct dentry *parent;
+       u64 old_ino = btrfs_ino(old_inode);
+       u64 new_ino = btrfs_ino(new_inode);
+       u64 old_idx = 0;
+       u64 new_idx = 0;
+       u64 root_objectid;
+       int ret;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
+
+       /* we only allow rename subvolume link between subvolumes */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+               return -EXDEV;
+
+       /* close the race window with snapshot create/destroy ioctl */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&dest->fs_info->subvol_sem);
+
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume their normal
+        * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 12);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
+
+       /*
+        * We need to find a free sequence number both in the source and
+        * in the destination directory for the exchange.
+        */
+       ret = btrfs_set_inode_index(new_dir, &old_idx);
+       if (ret)
+               goto out_fail;
+       ret = btrfs_set_inode_index(old_dir, &new_idx);
+       if (ret)
+               goto out_fail;
+
+       BTRFS_I(old_inode)->dir_index = 0ULL;
+       BTRFS_I(new_inode)->dir_index = 0ULL;
+
+       /* Reference for the source. */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(root->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_ino,
+                                            btrfs_ino(new_dir), old_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* And now for the dest. */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(dest->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            old_dentry->d_name.name,
+                                            old_dentry->d_name.len,
+                                            new_ino,
+                                            btrfs_ino(old_dir), new_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* Update inode version and ctime/mtime. */
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
+       inode_inc_iversion(new_inode);
+       old_dir->i_ctime = old_dir->i_mtime = ctime;
+       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_inode->i_ctime = ctime;
+       new_inode->i_ctime = ctime;
+
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+               btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+       }
+
+       /* src is a subvolume */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir,
+                                         root_objectid,
+                                         old_dentry->d_name.name,
+                                         old_dentry->d_name.len);
+       } else { /* src is an inode */
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                          old_dentry->d_inode,
+                                          old_dentry->d_name.name,
+                                          old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       /* dest is a subvolume */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                         root_objectid,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+       } else { /* dest is an inode */
+               ret = __btrfs_unlink_inode(trans, dest, new_dir,
+                                          new_dentry->d_inode,
+                                          new_dentry->d_name.name,
+                                          new_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, dest, new_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, old_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, old_dir, new_inode,
+                            old_dentry->d_name.name,
+                            old_dentry->d_name.len, 0, new_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       if (old_inode->i_nlink == 1)
+               BTRFS_I(old_inode)->dir_index = old_idx;
+       if (new_inode->i_nlink == 1)
+               BTRFS_I(new_inode)->dir_index = new_idx;
+
+       if (root_log_pinned) {
+               parent = new_dentry->d_parent;
+               btrfs_log_new_name(trans, old_inode, old_dir, parent);
+               btrfs_end_log_trans(root);
+               root_log_pinned = false;
+       }
+       if (dest_log_pinned) {
+               parent = old_dentry->d_parent;
+               btrfs_log_new_name(trans, new_inode, new_dir, parent);
+               btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
+       }
+out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
+       ret = btrfs_end_transaction(trans, root);
+out_notrans:
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&dest->fs_info->subvol_sem);
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
+
+       return ret;
+}
+
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *dir,
+                                    struct dentry *dentry)
+{
+       int ret;
+       struct inode *inode;
+       u64 objectid;
+       u64 index;
+
+       ret = btrfs_find_free_ino(root, &objectid);
+       if (ret)
+               return ret;
+
+       inode = btrfs_new_inode(trans, root, dir,
+                               dentry->d_name.name,
+                               dentry->d_name.len,
+                               btrfs_ino(dir),
+                               objectid,
+                               S_IFCHR | WHITEOUT_MODE,
+                               &index);
+
+       if (IS_ERR(inode)) {
+               ret = PTR_ERR(inode);
+               return ret;
+       }
+
+       inode->i_op = &btrfs_special_inode_operations;
+       init_special_inode(inode, inode->i_mode,
+               WHITEOUT_DEV);
+
+       ret = btrfs_init_inode_security(trans, inode, dir,
+                               &dentry->d_name);
+       if (ret)
+               goto out;
+
+       ret = btrfs_add_nondir(trans, dir, dentry,
+                               inode, 0, index);
+       if (ret)
+               goto out;
+
+       ret = btrfs_update_inode(trans, root, inode);
+out:
+       unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
+       iput(inode);
+
+       return ret;
+}
+
 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry)
+                          struct inode *new_dir, struct dentry *new_dentry,
+                          unsigned int flags)
 {
        struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
        struct btrfs_root *root = BTRFS_I(old_dir)->root;
        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
        struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9677,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        u64 root_objectid;
        int ret;
        u64 old_ino = btrfs_ino(old_inode);
+       bool log_pinned = false;
 
        if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                return -EPERM;
@@ -9449,15 +9728,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * We want to reserve the absolute worst case amount of items.  So if
         * both inodes are subvols and we need to unlink them then that would
         * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
+        * would require 5 item modifications, so we'll assume they are normal
         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
         * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need more 5 units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
         */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
-                ret = PTR_ERR(trans);
-                goto out_notrans;
-        }
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
 
        if (dest != root)
                btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9756,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(root->fs_info, trans);
        } else {
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
@@ -9478,14 +9765,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                                             btrfs_ino(new_dir), index);
                if (ret)
                        goto out_fail;
-               /*
-                * this is an ugly little race, but the rename is required
-                * to make sure that if we crash, the inode is either at the
-                * old name or the new one.  pinning the log transaction lets
-                * us make sure we don't allow a log commit to come in after
-                * we unlink the name but before we add the new name back in.
-                */
-               btrfs_pin_log_trans(root);
        }
 
        inode_inc_iversion(old_dir);
@@ -9552,12 +9831,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (old_inode->i_nlink == 1)
                BTRFS_I(old_inode)->dir_index = index;
 
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (log_pinned) {
                struct dentry *parent = new_dentry->d_parent;
+
                btrfs_log_new_name(trans, old_inode, old_dir, parent);
                btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
+
+       if (flags & RENAME_WHITEOUT) {
+               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+                                               old_dentry);
+
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
        }
 out_fail:
+       /*
+        * If we have pinned the log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && log_pinned) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
        btrfs_end_transaction(trans, root);
 out_notrans:
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9883,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
 {
-       if (flags & ~RENAME_NOREPLACE)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;
 
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (flags & RENAME_EXCHANGE)
+               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                         new_dentry);
+
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10259,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                btrfs_end_transaction(trans, root);
                        break;
                }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
 
                last_alloc = ins.offset;
                ret = insert_reserved_file_extent(trans, inode,