Merge tag 'xfs-reflink-for-linus-4.9-rc1' of git://git.kernel.org/pub/scm/linux/kerne...

[cascardo/linux.git] / fs / xfs / xfs_file.c
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index e612a02..a314fc7 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
  #include "xfs_icache.h"
  #include "xfs_pnfs.h"
  #include "xfs_iomap.h"
+#include "xfs_reflink.h"
  
  #include <linux/dcache.h>
  #include <linux/falloc.h>
@@ -269,6 +270,8 @@ xfs_file_dio_aio_read(
                 return -EINVAL;
         }
  
+       file_accessed(iocb->ki_filp);
+
         /*
          * Locking is a bit tricky here. If we take an exclusive lock for direct
          * IO, we effectively serialise all new concurrent read IO to this file
@@ -317,13 +320,12 @@ xfs_file_dio_aio_read(
         data = *to;
         ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
                         xfs_get_blocks_direct, NULL, NULL, 0);
-       if (ret > 0) {
+       if (ret >= 0) {
                 iocb->ki_pos += ret;
                 iov_iter_advance(to, ret);
         }
         xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
  
-       file_accessed(iocb->ki_filp);
         return ret;
  }
  
@@ -332,10 +334,7 @@ xfs_file_dax_read(
         struct kiocb            *iocb,
         struct iov_iter         *to)
  {
-       struct address_space    *mapping = iocb->ki_filp->f_mapping;
-       struct inode            *inode = mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct iov_iter         data = *to;
+       struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
         size_t                  count = iov_iter_count(to);
         ssize_t                 ret = 0;
  
@@ -345,11 +344,7 @@ xfs_file_dax_read(
                 return 0; /* skip atime */
  
         xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
-       if (ret > 0) {
-               iocb->ki_pos += ret;
-               iov_iter_advance(to, ret);
-       }
+       ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
         xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
  
         file_accessed(iocb->ki_filp);
@@ -399,45 +394,6 @@ xfs_file_read_iter(
         return ret;
  }
  
-STATIC ssize_t
-xfs_file_splice_read(
-       struct file             *infilp,
-       loff_t                  *ppos,
-       struct pipe_inode_info  *pipe,
-       size_t                  count,
-       unsigned int            flags)
-{
-       struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-       ssize_t                 ret;
-
-       XFS_STATS_INC(ip->i_mount, xs_read_calls);
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return -EIO;
-
-       trace_xfs_file_splice_read(ip, count, *ppos);
-
-       /*
-        * DAX inodes cannot ues the page cache for splice, so we have to push
-        * them through the VFS IO path. This means it goes through
-        * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
-        * cannot lock the splice operation at this level for DAX inodes.
-        */
-       if (IS_DAX(VFS_I(ip))) {
-               ret = default_file_splice_read(infilp, ppos, pipe, count,
-                                              flags);
-               goto out;
-       }
-
-       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-out:
-       if (ret > 0)
-               XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
-       return ret;
-}
-
  /*
   * Zero any on disk space between the current EOF and the new, larger EOF.
   *
@@ -679,6 +635,13 @@ xfs_file_dio_aio_write(
  
         trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
  
+       /* If this is a block-aligned directio CoW, remap immediately. */
+       if (xfs_is_reflink_inode(ip) && !unaligned_io) {
+               ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
+               if (ret)
+                       goto out;
+       }
+
         data = *from;
         ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
                         xfs_get_blocks_direct, xfs_end_io_direct_write,
@@ -711,70 +674,32 @@ xfs_file_dax_write(
         struct kiocb            *iocb,
         struct iov_iter         *from)
  {
-       struct address_space    *mapping = iocb->ki_filp->f_mapping;
-       struct inode            *inode = mapping->host;
+       struct inode            *inode = iocb->ki_filp->f_mapping->host;
         struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       ssize_t                 ret = 0;
-       int                     unaligned_io = 0;
-       int                     iolock;
-       struct iov_iter         data;
+       int                     iolock = XFS_IOLOCK_EXCL;
+       ssize_t                 ret, error = 0;
+       size_t                  count;
+       loff_t                  pos;
  
-       /* "unaligned" here means not aligned to a filesystem block */
-       if ((iocb->ki_pos & mp->m_blockmask) ||
-           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
-               unaligned_io = 1;
-               iolock = XFS_IOLOCK_EXCL;
-       } else if (mapping->nrpages) {
-               iolock = XFS_IOLOCK_EXCL;
-       } else {
-               iolock = XFS_IOLOCK_SHARED;
-       }
         xfs_rw_ilock(ip, iolock);
-
         ret = xfs_file_aio_write_checks(iocb, from, &iolock);
         if (ret)
                 goto out;
  
-       /*
-        * Yes, even DAX files can have page cache attached to them:  A zeroed
-        * page is inserted into the pagecache when we have to serve a write
-        * fault on a hole.  It should never be dirtied and can simply be
-        * dropped from the pagecache once we get real data for the page.
-        *
-        * XXX: This is racy against mmap, and there's nothing we can do about
-        * it. dax_do_io() should really do this invalidation internally as
-        * it will know if we've allocated over a holei for this specific IO and
-        * if so it needs to update the mapping tree and invalidate existing
-        * PTEs over the newly allocated range. Remove this invalidation when
-        * dax_do_io() is fixed up.
-        */
-       if (mapping->nrpages) {
-               loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+       pos = iocb->ki_pos;
+       count = iov_iter_count(from);
  
-               ret = invalidate_inode_pages2_range(mapping,
-                                                   iocb->ki_pos >> PAGE_SHIFT,
-                                                   end >> PAGE_SHIFT);
-               WARN_ON_ONCE(ret);
-       }
+       trace_xfs_file_dax_write(ip, count, pos);
  
-       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
-               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-               iolock = XFS_IOLOCK_SHARED;
+       ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+       if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+               i_size_write(inode, iocb->ki_pos);
+               error = xfs_setfilesize(ip, pos, ret);
         }
  
-       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
-       data = *from;
-       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-                       xfs_end_io_direct_write, 0);
-       if (ret > 0) {
-               iocb->ki_pos += ret;
-               iov_iter_advance(from, ret);
-       }
  out:
         xfs_rw_iunlock(ip, iolock);
-       return ret;
+       return error ? error : ret;
  }
  
  STATIC ssize_t
@@ -818,6 +743,9 @@ write_retry:
                 enospc = xfs_inode_free_quota_eofblocks(ip);
                 if (enospc)
                         goto write_retry;
+               enospc = xfs_inode_free_quota_cowblocks(ip);
+               if (enospc)
+                       goto write_retry;
         } else if (ret == -ENOSPC && !enospc) {
                 struct xfs_eofblocks eofb = {0};
  
@@ -857,10 +785,20 @@ xfs_file_write_iter(
  
         if (IS_DAX(inode))
                 ret = xfs_file_dax_write(iocb, from);
-       else if (iocb->ki_flags & IOCB_DIRECT)
+       else if (iocb->ki_flags & IOCB_DIRECT) {
+               /*
+                * Allow a directio write to fall back to a buffered
+                * write *only* in the case that we're doing a reflink
+                * CoW.  In all other directio scenarios we do not
+                * allow an operation to fall back to buffered mode.
+                */
                 ret = xfs_file_dio_aio_write(iocb, from);
-       else
+               if (ret == -EREMCHG)
+                       goto buffered;
+       } else {
+buffered:
                 ret = xfs_file_buffered_aio_write(iocb, from);
+       }
  
         if (ret > 0) {
                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
@@ -874,7 +812,7 @@ xfs_file_write_iter(
  #define        XFS_FALLOC_FL_SUPPORTED                                         \
                 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
                  FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
-                FALLOC_FL_INSERT_RANGE)
+                FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
  
  STATIC long
  xfs_file_fallocate(
@@ -964,9 +902,15 @@ xfs_file_fallocate(
  
                 if (mode & FALLOC_FL_ZERO_RANGE)
                         error = xfs_zero_file_space(ip, offset, len);
-               else
+               else {
+                       if (mode & FALLOC_FL_UNSHARE_RANGE) {
+                               error = xfs_reflink_unshare(ip, offset, len);
+                               if (error)
+                                       goto out_unlock;
+                       }
                         error = xfs_alloc_file_space(ip, offset, len,
                                                      XFS_BMAPI_PREALLOC);
+               }
                 if (error)
                         goto out_unlock;
         }
@@ -984,7 +928,7 @@ xfs_file_fallocate(
  
                 iattr.ia_valid = ATTR_SIZE;
                 iattr.ia_size = new_size;
-               error = xfs_setattr_size(ip, &iattr);
+               error = xfs_vn_setattr_size(file_dentry(file), &iattr);
                 if (error)
                         goto out_unlock;
         }
@@ -1003,6 +947,189 @@ out_unlock:
         return error;
  }
  
+/*
+ * Wait for in-flight direct I/O, then write back and wait on the rounded range.
+ */
+static int
+xfs_file_wait_for_io(
+       struct inode    *inode,
+       loff_t          offset,
+       size_t          len)
+{
+       loff_t          rounding;
+       loff_t          ioffset;
+       loff_t          iendoffset;
+       loff_t          bs;
+       int             ret;
+
+       bs = inode->i_sb->s_blocksize;
+       inode_dio_wait(inode);
+
+       rounding = max_t(xfs_off_t, bs, PAGE_SIZE);
+       ioffset = round_down(offset, rounding);
+       iendoffset = round_up(offset + len, rounding) - 1;
+       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+                                          iendoffset);
+       return ret;
+}
+
+/* Common helper behind the VFS copy, clone and dedupe range hooks */
+STATIC int
+xfs_file_share_range(
+       struct file     *file_in,
+       loff_t          pos_in,
+       struct file     *file_out,
+       loff_t          pos_out,
+       u64             len,
+       bool            is_dedupe)
+{
+       struct inode    *inode_in;
+       struct inode    *inode_out;
+       ssize_t         ret;
+       loff_t          bs;
+       loff_t          isize;
+       int             same_inode;
+       loff_t          blen;
+       unsigned int    flags = 0;
+
+       inode_in = file_inode(file_in);
+       inode_out = file_inode(file_out);
+       bs = inode_out->i_sb->s_blocksize;
+
+       /* Don't touch certain kinds of inodes */
+       if (IS_IMMUTABLE(inode_out))
+               return -EPERM;
+       if (IS_SWAPFILE(inode_in) ||
+           IS_SWAPFILE(inode_out))
+               return -ETXTBSY;
+
+       /* Reflink only works within this filesystem. */
+       if (inode_in->i_sb != inode_out->i_sb)
+               return -EXDEV;
+       same_inode = (inode_in->i_ino == inode_out->i_ino);
+
+       /* Don't reflink dirs, pipes, sockets... */
+       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+               return -EISDIR;
+       if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+               return -EINVAL;
+       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+               return -EINVAL;
+
+       /* Don't share DAX file data for now. */
+       if (IS_DAX(inode_in) || IS_DAX(inode_out))
+               return -EINVAL;
+
+       /* Are we going all the way to the end? */
+       isize = i_size_read(inode_in);
+       if (isize == 0)
+               return 0;
+       if (len == 0)
+               len = isize - pos_in;
+
+       /* Ensure offsets don't wrap and the input is inside i_size */
+       if (pos_in + len < pos_in || pos_out + len < pos_out ||
+           pos_in + len > isize)
+               return -EINVAL;
+
+       /* Don't allow dedupe past EOF in the dest file */
+       if (is_dedupe) {
+               loff_t  disize;
+
+               disize = i_size_read(inode_out);
+               if (pos_out >= disize || pos_out + len > disize)
+                       return -EINVAL;
+       }
+
+       /* If we're linking to EOF, continue to the block boundary. */
+       if (pos_in + len == isize)
+               blen = ALIGN(isize, bs) - pos_in;
+       else
+               blen = len;
+
+       /* Only reflink if we're aligned to block boundaries */
+       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+               return -EINVAL;
+
+       /* Don't allow overlapped reflink within the same file */
+       if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
+               return -EINVAL;
+
+       /* Wait for the completion of any pending IOs on both files */
+       ret = xfs_file_wait_for_io(inode_in, pos_in, len);
+       if (ret)
+               goto out;
+       ret = xfs_file_wait_for_io(inode_out, pos_out, len);
+       if (ret)
+               goto out;
+
+       if (is_dedupe)
+               flags |= XFS_REFLINK_DEDUPE;
+       ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
+                       pos_out, len, flags);
+       if (ret < 0)
+               goto out;
+
+out:
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_copy_range(
+       struct file     *file_in,
+       loff_t          pos_in,
+       struct file     *file_out,
+       loff_t          pos_out,
+       size_t          len,
+       unsigned int    flags)
+{
+       int             error;
+
+       error = xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+                                    len, false);
+       if (error)
+               return error;
+       return len;
+}
+
+STATIC int
+xfs_file_clone_range(
+       struct file     *file_in,
+       loff_t          pos_in,
+       struct file     *file_out,
+       loff_t          pos_out,
+       u64             len)
+{
+       return xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+                                    len, false);
+}
+
+#define XFS_MAX_DEDUPE_LEN     (16 * 1024 * 1024)
+STATIC ssize_t
+xfs_file_dedupe_range(
+       struct file     *src_file,
+       u64             loff,
+       u64             len,
+       struct file     *dst_file,
+       u64             dst_loff)
+{
+       int             error;
+
+       /*
+        * Limit the total length we will dedupe for each operation.
+        * This is intended to bound the total time spent in this
+        * ioctl to something sane.
+        */
+       if (len > XFS_MAX_DEDUPE_LEN)
+               len = XFS_MAX_DEDUPE_LEN;
+
+       error = xfs_file_share_range(src_file, loff, dst_file, dst_loff,
+                                    len, true);
+       if (error)
+               return error;
+       return len;
+}
  
  STATIC int
  xfs_file_open(
@@ -1513,7 +1640,7 @@ xfs_filemap_page_mkwrite(
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
         if (IS_DAX(inode)) {
-               ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+               ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
         } else {
                 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                 ret = block_page_mkwrite_return(ret);
@@ -1547,7 +1674,7 @@ xfs_filemap_fault(
                  * changes to xfs_get_blocks_direct() to map unwritten extent
                  * ioend for conversion on read-only mappings.
                  */
-               ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+               ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
         } else
                 ret = filemap_fault(vma, vmf);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1652,7 +1779,7 @@ const struct file_operations xfs_file_operations = {
         .llseek         = xfs_file_llseek,
         .read_iter      = xfs_file_read_iter,
         .write_iter     = xfs_file_write_iter,
-       .splice_read    = xfs_file_splice_read,
+       .splice_read    = generic_file_splice_read,
         .splice_write   = iter_file_splice_write,
         .unlocked_ioctl = xfs_file_ioctl,
  #ifdef CONFIG_COMPAT
@@ -1662,7 +1789,11 @@ const struct file_operations xfs_file_operations = {
         .open           = xfs_file_open,
         .release        = xfs_file_release,
         .fsync          = xfs_file_fsync,
+       .get_unmapped_area = thp_get_unmapped_area,
         .fallocate      = xfs_file_fallocate,
+       .copy_file_range = xfs_file_copy_range,
+       .clone_file_range = xfs_file_clone_range,
+       .dedupe_file_range = xfs_file_dedupe_range,
  };
  
  const struct file_operations xfs_dir_file_operations = {