Merge tag 'nfs-for-4.5-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
author: Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Jan 2016 00:08:23 +0000 (16:08 -0800)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Jan 2016 00:08:23 +0000 (16:08 -0800)
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable fixes:
   - Fix a regression in the SunRPC socket polling code
   - Fix the attribute cache revalidation code
   - Fix race in __update_open_stateid()
   - Fix an lo->plh_block_lgets imbalance in layoutreturn
   - Fix an Oopsable typo in ff_mirror_match_fh()

  Features:
   - pNFS layout recall performance improvements.
   - pNFS/flexfiles: Support server-supplied layoutstats sampling period

  Bugfixes + cleanups:
   - NFSv4: Don't perform cached access checks before we've OPENed the
     file
   - Fix starvation issues with background flushes
   - Reclaim writes should be flushed as unstable writes if there are
     already entries in the commit lists
   - Various bugfixes from Chuck to fix NFS/RDMA send queue ordering
     problems
   - Ensure that we propagate fatal layoutget errors back to the
     application
   - Fixes for sundry flexfiles layoutstats bugs
   - Fix files/flexfiles to not cache invalidated layouts in the DS
     commit buckets"

* tag 'nfs-for-4.5-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (68 commits)
  NFS: Fix a compile warning about unused variable in nfs_generic_pg_pgios()
  NFSv4: Fix a compile warning about no prototype for nfs4_ioctl()
  NFS: Use wait_on_atomic_t() for unlock after readahead
  SUNRPC: Fixup socket wait for memory
  NFSv4.1/pNFS: Cleanup constify struct pnfs_layout_range arguments
  NFSv4.1/pnfs: Cleanup copying of pnfs_layout_range structures
  NFSv4.1/pNFS: Cleanup pnfs_mark_matching_lsegs_invalid()
  NFSv4.1/pNFS: Fix a race in initiate_file_draining()
  NFSv4.1/pNFS: pnfs_error_mark_layout_for_return() must always return layout
  NFSv4.1/pNFS: pnfs_mark_matching_lsegs_return() should set the iomode
  NFSv4.1/pNFS: Use nfs4_stateid_copy for copying stateids
  NFSv4.1/pNFS: Don't pass stateids by value to pnfs_send_layoutreturn()
  NFS: Relax requirements in nfs_flush_incompatible
  NFSv4.1/pNFS: Don't queue up a new commit if the layout segment is invalid
  NFS: Allow multiple commit requests in flight per file
  NFS/pNFS: Fix up pNFS write reschedule layering violations and bugs
  SUNRPC: Fix a missing break in rpc_anyaddr()
  pNFS/flexfiles: Fix an Oopsable typo in ff_mirror_match_fh()
  NFS: Fix attribute cache revalidation
  NFS: Ensure we revalidate attributes before using execute_ok()
  ...

36 files changed:
fs/nfs/callback_proc.c
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs42proc.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4sysctl.c
fs/nfs/nfs4trace.c
fs/nfs/nfs4trace.h
fs/nfs/nfstrace.h
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_nfs.c
fs/nfs/read.c
fs/nfs/write.c
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
net/sunrpc/clnt.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/physical_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c

index 807eb6e..f0939d0 100644 (file: fs/nfs/callback_proc.c)
@@ -83,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 
        res = htonl(NFS4ERR_BADHANDLE);
        inode = nfs_delegation_find_inode(cps->clp, &args->fh);
-       if (inode == NULL)
+       if (inode == NULL) {
+               trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+                               &args->stateid, -ntohl(res));
                goto out;
+       }
        /* Set up a helper thread to actually return the delegation */
        switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
        case 0:
@@ -96,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
        default:
                res = htonl(NFS4ERR_RESOURCE);
        }
-       trace_nfs4_recall_delegation(inode, -ntohl(res));
+       trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+                       &args->stateid, -ntohl(res));
        iput(inode);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -160,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
        return lo;
 }
 
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ */
+static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+                                       const nfs4_stateid *new)
+{
+       u32 oldseq, newseq;
+
+       oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+       newseq = be32_to_cpu(new->seqid);
+
+       if (newseq > oldseq + 1)
+               return false;
+       return true;
+}
+
 static u32 initiate_file_draining(struct nfs_client *clp,
                                  struct cb_layoutrecallargs *args)
 {
@@ -169,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        LIST_HEAD(free_me_list);
 
        lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
-       if (!lo)
+       if (!lo) {
+               trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
+                               &args->cbl_stateid, -rv);
                goto out;
+       }
 
        ino = lo->plh_inode;
 
        spin_lock(&ino->i_lock);
+       if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
+               rv = NFS4ERR_DELAY;
+               goto unlock;
+       }
        pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
        spin_unlock(&ino->i_lock);
 
        pnfs_layoutcommit_inode(ino, false);
 
        spin_lock(&ino->i_lock);
-       if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-           pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
-                                       &args->cbl_range)) {
+       /*
+        * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+        */
+       if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                rv = NFS4ERR_DELAY;
                goto unlock;
        }
 
+       if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+                                       &args->cbl_range)) {
+               rv = NFS4_OK;
+               goto unlock;
+       }
+
        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
                        &args->cbl_range);
        }
+       pnfs_mark_layout_returned_if_empty(lo);
 unlock:
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&free_me_list);
+       /* Free all lsegs that are attached to commit buckets */
+       nfs_commit_inode(ino, 0);
        pnfs_put_layout_hdr(lo);
-       trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
+       trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+                       &args->cbl_stateid, -rv);
        iput(ino);
 out:
        return rv;
index 8a05309..c82a212 100644 (file: fs/nfs/dir.c)
@@ -2431,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
 }
 EXPORT_SYMBOL_GPL(nfs_may_open);
 
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+       struct nfs_server *server = NFS_SERVER(inode);
+       int ret;
+
+       if (mask & MAY_NOT_BLOCK)
+               ret = nfs_revalidate_inode_rcu(server, inode);
+       else
+               ret = nfs_revalidate_inode(server, inode);
+       if (ret == 0 && !execute_ok(inode))
+               ret = -EACCES;
+       return ret;
+}
+
 int nfs_permission(struct inode *inode, int mask)
 {
        struct rpc_cred *cred;
@@ -2448,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask)
                case S_IFLNK:
                        goto out;
                case S_IFREG:
+                       if ((mask & MAY_OPEN) &&
+                          nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+                               return 0;
                        break;
                case S_IFDIR:
                        /*
@@ -2480,8 +2497,8 @@ force_lookup:
                        res = PTR_ERR(cred);
        }
 out:
-       if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
-               res = -EACCES;
+       if (!res && (mask & MAY_EXEC))
+               res = nfs_execute_ok(inode, mask);
 
        dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
                inode->i_sb->s_id, inode->i_ino, mask, res);
index 4b1d08f..7ab7ec9 100644 (file: fs/nfs/direct.c)
@@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
        return atomic_dec_and_test(&dreq->io_count);
 }
 
-void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
-{
-       dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-}
-EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
-
 static void
 nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
 {
@@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
        req = nfs_list_entry(reqs.next);
        nfs_direct_setup_mirroring(dreq, &desc, req);
+       if (desc.pg_error < 0) {
+               list_splice_init(&reqs, &failed);
+               goto out_failed;
+       }
 
        list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
                if (!nfs_pageio_add_request(&desc, req)) {
@@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
                        nfs_list_add_request(req, &failed);
                        spin_lock(cinfo.lock);
                        dreq->flags = 0;
-                       dreq->error = -EIO;
+                       if (desc.pg_error < 0)
+                               dreq->error = desc.pg_error;
+                       else
+                               dreq->error = -EIO;
                        spin_unlock(cinfo.lock);
                }
                nfs_release_request(req);
        }
        nfs_pageio_complete(&desc);
 
+out_failed:
        while (!list_empty(&failed)) {
                req = nfs_list_entry(failed.next);
                nfs_list_remove_request(req);
@@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
                nfs_direct_write_complete(dreq, data->inode);
 }
 
-static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+               struct nfs_page *req)
 {
-       /* There is no lock to clear */
+       struct nfs_direct_req *dreq = cinfo->dreq;
+
+       spin_lock(&dreq->lock);
+       dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+       spin_unlock(&dreq->lock);
+       nfs_mark_request_commit(req, NULL, cinfo, 0);
 }
 
 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
        .completion = nfs_direct_commit_complete,
-       .error_cleanup = nfs_direct_error_cleanup,
+       .resched_write = nfs_direct_resched_write,
 };
 
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
@@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
        }
 }
 
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+       struct nfs_direct_req *dreq = hdr->dreq;
+
+       spin_lock(&dreq->lock);
+       if (dreq->error == 0) {
+               dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+               /* fake unstable write to let common nfs resend pages */
+               hdr->verf.committed = NFS_UNSTABLE;
+               hdr->good_bytes = hdr->args.count;
+       }
+       spin_unlock(&dreq->lock);
+}
+
 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
        .error_cleanup = nfs_write_sync_pgio_error,
        .init_hdr = nfs_direct_pgio_init,
        .completion = nfs_direct_write_completion,
+       .reschedule_io = nfs_direct_write_reschedule_io,
 };
 
 
@@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
                        }
 
                        nfs_direct_setup_mirroring(dreq, &desc, req);
+                       if (desc.pg_error < 0) {
+                               nfs_free_request(req);
+                               result = desc.pg_error;
+                               break;
+                       }
 
                        nfs_lock_request(req);
                        req->wb_index = pos >> PAGE_SHIFT;
index 93e2364..4ef8f5a 100644 (file: fs/nfs/file.c)
@@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page,
         * so it will not block due to pages that will shortly be freeable.
         */
        nfsi = NFS_I(mapping->host);
-       if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+       if (atomic_read(&nfsi->commit_info.rpcs_out)) {
                *writeback = true;
                return;
        }
@@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page)
                inode->i_ino, (long long)page_offset(page));
 
        nfs_fscache_wait_on_page_write(nfsi, page);
-       return nfs_wb_page(inode, page);
+       return nfs_wb_launder_page(inode, page);
 }
 
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -756,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 
        l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
        if (!IS_ERR(l_ctx)) {
-               status = nfs_iocounter_wait(&l_ctx->io_count);
+               status = nfs_iocounter_wait(l_ctx);
                nfs_put_lock_context(l_ctx);
                if (status < 0)
                        return status;
index 02ec079..bb1f4e7 100644 (file: fs/nfs/filelayout/filelayout.c)
@@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
                        task->tk_status);
                nfs4_mark_deviceid_unavailable(devid);
                pnfs_error_mark_layout_for_return(inode, lseg);
+               pnfs_set_lo_fail(lseg);
                rpc_wake_up(&tbl->slot_tbl_waitq);
                /* fall through */
        default:
@@ -883,13 +884,19 @@ static void
 filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
 {
-       if (!pgio->pg_lseg)
+       if (!pgio->pg_lseg) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           0,
                                           NFS4_MAX_UINT64,
                                           IOMODE_READ,
                                           GFP_KERNEL);
+               if (IS_ERR(pgio->pg_lseg)) {
+                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+                       pgio->pg_lseg = NULL;
+                       return;
+               }
+       }
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
                nfs_pageio_reset_read_mds(pgio);
@@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
        struct nfs_commit_info cinfo;
        int status;
 
-       if (!pgio->pg_lseg)
+       if (!pgio->pg_lseg) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           0,
                                           NFS4_MAX_UINT64,
                                           IOMODE_RW,
                                           GFP_NOFS);
+               if (IS_ERR(pgio->pg_lseg)) {
+                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+                       pgio->pg_lseg = NULL;
+                       return;
+               }
+       }
+
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;
index 03516c8..6594e9f 100644 (file: fs/nfs/flexfilelayout/flexfilelayout.c)
@@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
                return false;
        for (i = 0; i < m1->fh_versions_cnt; i++) {
                bool found_fh = false;
-               for (j = 0; j < m2->fh_versions_cnt; i++) {
+               for (j = 0; j < m2->fh_versions_cnt; j++) {
                        if (nfs_compare_fh(&m1->fh_versions[i],
                                        &m2->fh_versions[j]) == 0) {
                                found_fh = true;
@@ -505,9 +505,17 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
        }
 
        p = xdr_inline_decode(&stream, 4);
-       if (p)
-               fls->flags = be32_to_cpup(p);
+       if (!p)
+               goto out_sort_mirrors;
+       fls->flags = be32_to_cpup(p);
+
+       p = xdr_inline_decode(&stream, 4);
+       if (!p)
+               goto out_sort_mirrors;
+       for (i=0; i < fls->mirror_array_cnt; i++)
+               fls->mirror_array[i]->report_interval = be32_to_cpup(p);
 
+out_sort_mirrors:
        ff_layout_sort_mirrors(fls);
        rc = ff_layout_check_layout(lgr);
        if (rc)
@@ -603,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
                mirror->start_time = now;
        if (ktime_equal(mirror->last_report_time, notime))
                mirror->last_report_time = now;
-       if (layoutstats_timer != 0)
+       if (mirror->report_interval != 0)
+               report_interval = (s64)mirror->report_interval * 1000LL;
+       else if (layoutstats_timer != 0)
                report_interval = (s64)layoutstats_timer * 1000LL;
        if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
                        report_interval) {
@@ -785,13 +795,19 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        int ds_idx;
 
        /* Use full layout for now */
-       if (!pgio->pg_lseg)
+       if (!pgio->pg_lseg) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_READ,
                                                   GFP_KERNEL);
+               if (IS_ERR(pgio->pg_lseg)) {
+                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+                       pgio->pg_lseg = NULL;
+                       return;
+               }
+       }
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;
@@ -825,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
        int i;
        int status;
 
-       if (!pgio->pg_lseg)
+       if (!pgio->pg_lseg) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
                                                   GFP_NOFS);
+               if (IS_ERR(pgio->pg_lseg)) {
+                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+                       pgio->pg_lseg = NULL;
+                       return;
+               }
+       }
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;
@@ -867,18 +889,25 @@ static unsigned int
 ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
                                    struct nfs_page *req)
 {
-       if (!pgio->pg_lseg)
+       if (!pgio->pg_lseg) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
                                                   GFP_NOFS);
+               if (IS_ERR(pgio->pg_lseg)) {
+                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+                       pgio->pg_lseg = NULL;
+                       goto out;
+               }
+       }
        if (pgio->pg_lseg)
                return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
        /* no lseg means that pnfs is not in use, so no mirroring here */
        nfs_pageio_reset_write_mds(pgio);
+out:
        return 1;
 }
 
@@ -912,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
                        hdr->args.count,
                        (unsigned long long)hdr->args.offset);
 
-               if (!hdr->dreq) {
-                       struct nfs_open_context *ctx;
-
-                       ctx = nfs_list_entry(hdr->pages.next)->wb_context;
-                       set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
-                       hdr->completion_ops->error_cleanup(&hdr->pages);
-               } else {
-                       nfs_direct_set_resched_writes(hdr->dreq);
-                       /* fake unstable write to let common nfs resend pages */
-                       hdr->verf.committed = NFS_UNSTABLE;
-                       hdr->good_bytes = hdr->args.count;
-               }
+               hdr->completion_ops->reschedule_io(hdr);
                return;
        }
 
@@ -1101,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
        return -NFS4ERR_RESET_TO_PNFS;
 out_retry:
        task->tk_status = 0;
-       rpc_restart_call(task);
+       rpc_restart_call_prepare(task);
        rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
        return -EAGAIN;
 }
@@ -1159,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
                }
        }
 
+       switch (status) {
+       case NFS4ERR_DELAY:
+       case NFS4ERR_GRACE:
+               return;
+       default:
+               break;
+       }
+
        mirror = FF_LAYOUT_COMP(lseg, idx);
        err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
                                       mirror, offset, length, status, opnum,
@@ -1242,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
        return ff_layout_test_devid_unavailable(node);
 }
 
-static int ff_layout_read_prepare_common(struct rpc_task *task,
-                                        struct nfs_pgio_header *hdr)
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+               struct nfs_pgio_header *hdr)
 {
+       if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+               return;
        nfs4_ff_layout_stat_io_start_read(hdr->inode,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count,
                        task->tk_start);
+}
+
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+               struct nfs_pgio_header *hdr)
+{
+       if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+               return;
+       nfs4_ff_layout_stat_io_end_read(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count,
+                       hdr->res.count);
+}
 
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+                                        struct nfs_pgio_header *hdr)
+{
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -1265,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
        }
        hdr->pgio_done_cb = ff_layout_read_done_cb;
 
+       ff_layout_read_record_layoutstats_start(task, hdr);
        return 0;
 }
 
@@ -1323,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 
        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
-       nfs4_ff_layout_stat_io_end_read(task,
-                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-                       hdr->args.count, hdr->res.count);
-
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1341,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
+       ff_layout_read_record_layoutstats_done(task, hdr);
        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
 }
 
+static void ff_layout_read_release(void *data)
+{
+       struct nfs_pgio_header *hdr = data;
+
+       ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+       pnfs_generic_rw_release(data);
+}
+
+
 static int ff_layout_write_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
 {
@@ -1362,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
-               pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
                ff_layout_reset_write(hdr, true);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
-               pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
                ff_layout_reset_write(hdr, false);
                return task->tk_status;
        case -EAGAIN:
-               rpc_restart_call_prepare(task);
                return -EAGAIN;
        }
 
@@ -1402,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
-               pnfs_set_retry_layoutget(data->lseg->pls_layout);
                pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -NFS4ERR_RESET_TO_MDS:
-               pnfs_clear_retry_layoutget(data->lseg->pls_layout);
                pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -EAGAIN:
@@ -1421,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
        return 0;
 }
 
-static int ff_layout_write_prepare_common(struct rpc_task *task,
-                                         struct nfs_pgio_header *hdr)
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+               struct nfs_pgio_header *hdr)
 {
+       if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+               return;
        nfs4_ff_layout_stat_io_start_write(hdr->inode,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count,
                        task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+               struct nfs_pgio_header *hdr)
+{
+       if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+               return;
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count, hdr->res.count,
+                       hdr->res.verf->committed);
+}
 
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+                                         struct nfs_pgio_header *hdr)
+{
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -1445,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
                return -EAGAIN;
        }
 
+       ff_layout_write_record_layoutstats_start(task, hdr);
        return 0;
 }
 
@@ -1480,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
-       nfs4_ff_layout_stat_io_end_write(task,
-                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-                       hdr->args.count, hdr->res.count,
-                       hdr->res.verf->committed);
-
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1499,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
+       ff_layout_write_record_layoutstats_done(task, hdr);
        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 }
 
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
+static void ff_layout_write_release(void *data)
+{
+       struct nfs_pgio_header *hdr = data;
+
+       ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+       pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
                struct nfs_commit_data *cdata)
 {
+       if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+               return;
        nfs4_ff_layout_stat_io_start_write(cdata->inode,
                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
                        0, task->tk_start);
 }
 
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+               struct nfs_commit_data *cdata)
+{
+       struct nfs_page *req;
+       __u64 count = 0;
+
+       if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+               return;
+
+       if (task->tk_status == 0) {
+               list_for_each_entry(req, &cdata->pages, wb_list)
+                       count += req->wb_bytes;
+       }
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+                       count, count, NFS_FILE_SYNC);
+}
+
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+               struct nfs_commit_data *cdata)
+{
+       ff_layout_commit_record_layoutstats_start(task, cdata);
+}
+
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 {
        ff_layout_commit_prepare_common(task, data);
@@ -1531,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 
 static void ff_layout_commit_done(struct rpc_task *task, void *data)
 {
-       struct nfs_commit_data *cdata = data;
-       struct nfs_page *req;
-       __u64 count = 0;
-
-       if (task->tk_status == 0) {
-               list_for_each_entry(req, &cdata->pages, wb_list)
-                       count += req->wb_bytes;
-       }
-
-       nfs4_ff_layout_stat_io_end_write(task,
-                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
-                       count, count, NFS_FILE_SYNC);
-
        pnfs_generic_write_commit_done(task, data);
 }
 
@@ -1551,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
 {
        struct nfs_commit_data *cdata = data;
 
+       ff_layout_commit_record_layoutstats_done(task, cdata);
        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
 }
 
+static void ff_layout_commit_release(void *data)
+{
+       struct nfs_commit_data *cdata = data;
+
+       ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+       pnfs_generic_commit_release(data);
+}
+
 static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_read_prepare_v3,
        .rpc_call_done = ff_layout_read_call_done,
        .rpc_count_stats = ff_layout_read_count_stats,
-       .rpc_release = pnfs_generic_rw_release,
+       .rpc_release = ff_layout_read_release,
 };
 
 static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_read_prepare_v4,
        .rpc_call_done = ff_layout_read_call_done,
        .rpc_count_stats = ff_layout_read_count_stats,
-       .rpc_release = pnfs_generic_rw_release,
+       .rpc_release = ff_layout_read_release,
 };
 
 static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_write_prepare_v3,
        .rpc_call_done = ff_layout_write_call_done,
        .rpc_count_stats = ff_layout_write_count_stats,
-       .rpc_release = pnfs_generic_rw_release,
+       .rpc_release = ff_layout_write_release,
 };
 
 static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_write_prepare_v4,
        .rpc_call_done = ff_layout_write_call_done,
        .rpc_count_stats = ff_layout_write_count_stats,
-       .rpc_release = pnfs_generic_rw_release,
+       .rpc_release = ff_layout_write_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v3,
        .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
-       .rpc_release = pnfs_generic_commit_release,
+       .rpc_release = ff_layout_commit_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v4,
        .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
-       .rpc_release = pnfs_generic_commit_release,
+       .rpc_release = ff_layout_commit_release,
 };
 
 static enum pnfs_try_status
index 2bb08bc..dd353bb 100644 (file)
@@ -85,6 +85,7 @@ struct nfs4_ff_layout_mirror {
        struct nfs4_ff_layoutstat       write_stat;
        ktime_t                         start_time;
        ktime_t                         last_report_time;
+       u32                             report_interval;
 };
 
 struct nfs4_ff_layout_segment {
index e125e55..bd03275 100644 (file)
@@ -429,22 +429,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                                         mirror, lseg->pls_range.offset,
                                         lseg->pls_range.length, NFS4ERR_NXIO,
                                         OP_ILLEGAL, GFP_NOIO);
-               if (fail_return) {
-                       pnfs_error_mark_layout_for_return(ino, lseg);
-                       if (ff_layout_has_available_ds(lseg))
-                               pnfs_set_retry_layoutget(lseg->pls_layout);
-                       else
-                               pnfs_clear_retry_layoutget(lseg->pls_layout);
-
-               } else {
+               if (!fail_return) {
                        if (ff_layout_has_available_ds(lseg))
                                set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
                                        &lseg->pls_layout->plh_flags);
-                       else {
+                       else
                                pnfs_error_mark_layout_for_return(ino, lseg);
-                               pnfs_clear_retry_layoutget(lseg->pls_layout);
-                       }
-               }
+               } else
+                       pnfs_error_mark_layout_for_return(ino, lseg);
        }
 out_update_creds:
        if (ff_layout_update_mirror_cred(mirror, ds))
index bdb4dc7..c11e855 100644 (file)
@@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
        return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
-/**
- * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
- * @word: long word containing the bit lock
- */
-int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+static int nfs_wait_killable(int mode)
 {
        freezable_schedule_unsafe();
        if (signal_pending_state(mode, current))
                return -ERESTARTSYS;
        return 0;
 }
+
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+       return nfs_wait_killable(mode);
+}
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
 
+int nfs_wait_atomic_killable(atomic_t *p)
+{
+       return nfs_wait_killable(TASK_KILLABLE);
+}
+
 /**
  * nfs_compat_user_ino64 - returns the user-visible inode number
  * @fileid: 64-bit fileid
@@ -700,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
        l_ctx->lockowner.l_owner = current->files;
        l_ctx->lockowner.l_pid = current->tgid;
        INIT_LIST_HEAD(&l_ctx->list);
-       nfs_iocounter_init(&l_ctx->io_count);
+       atomic_set(&l_ctx->io_count, 0);
 }
 
 static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
@@ -913,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp)
        if (ctx) {
                struct inode *inode = d_inode(ctx->dentry);
 
+               /*
+                * We hit a fatal error on an earlier write. Try to write
+                * back every page again.
+                */
+               if (ctx->error < 0)
+                       invalidate_inode_pages2(inode->i_mapping);
                filp->private_data = NULL;
                spin_lock(&inode->i_lock);
                list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -1663,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        unsigned long invalid = 0;
        unsigned long now = jiffies;
        unsigned long save_cache_validity;
+       bool cache_revalidated = true;
 
        dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
                        __func__, inode->i_sb->s_id, inode->i_ino,
@@ -1724,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                nfs_force_lookup_revalidate(inode);
                        inode->i_version = fattr->change_attr;
                }
-       } else
+       } else {
                nfsi->cache_validity |= save_cache_validity;
+               cache_revalidated = false;
+       }
 
        if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
                memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-       } else if (server->caps & NFS_CAP_MTIME)
+       } else if (server->caps & NFS_CAP_MTIME) {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATTR
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
        if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
                memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-       } else if (server->caps & NFS_CAP_CTIME)
+       } else if (server->caps & NFS_CAP_CTIME) {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATTR
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
        /* Check if our cached file size is stale */
        if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1759,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                        (long long)cur_isize,
                                        (long long)new_isize);
                }
-       } else
+       } else {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATTR
                                | NFS_INO_REVAL_PAGECACHE
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
 
        if (fattr->valid & NFS_ATTR_FATTR_ATIME)
                memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
-       else if (server->caps & NFS_CAP_ATIME)
+       else if (server->caps & NFS_CAP_ATIME) {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATIME
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
        if (fattr->valid & NFS_ATTR_FATTR_MODE) {
                if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
@@ -1780,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        inode->i_mode = newmode;
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
                }
-       } else if (server->caps & NFS_CAP_MODE)
+       } else if (server->caps & NFS_CAP_MODE) {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATTR
                                | NFS_INO_INVALID_ACCESS
                                | NFS_INO_INVALID_ACL
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
        if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
                if (!uid_eq(inode->i_uid, fattr->uid)) {
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
                        inode->i_uid = fattr->uid;
                }
-       } else if (server->caps & NFS_CAP_OWNER)
+       } else if (server->caps & NFS_CAP_OWNER) {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATTR
                                | NFS_INO_INVALID_ACCESS
                                | NFS_INO_INVALID_ACL
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
        if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
                if (!gid_eq(inode->i_gid, fattr->gid)) {
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
                        inode->i_gid = fattr->gid;
                }
-       } else if (server->caps & NFS_CAP_OWNER_GROUP)
+       } else if (server->caps & NFS_CAP_OWNER_GROUP) {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATTR
                                | NFS_INO_INVALID_ACCESS
                                | NFS_INO_INVALID_ACL
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
        if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
                if (inode->i_nlink != fattr->nlink) {
@@ -1818,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                invalid |= NFS_INO_INVALID_DATA;
                        set_nlink(inode, fattr->nlink);
                }
-       } else if (server->caps & NFS_CAP_NLINK)
+       } else if (server->caps & NFS_CAP_NLINK) {
                nfsi->cache_validity |= save_cache_validity &
                                (NFS_INO_INVALID_ATTR
                                | NFS_INO_REVAL_FORCED);
+               cache_revalidated = false;
+       }
 
        if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
                /*
                 * report the blocks in 512byte units
                 */
                inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-       }
-       if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+       } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
                inode->i_blocks = fattr->du.nfs2.blocks;
+       else
+               cache_revalidated = false;
 
        /* Update attrtimeo value if we're out of the unstable period */
        if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1840,9 +1872,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                /* Set barrier to be more recent than all outstanding updates */
                nfsi->attr_gencount = nfs_inc_attr_generation_counter();
        } else {
-               if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
-                       if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
-                               nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+               if (cache_revalidated) {
+                       if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+                               nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+                               nfsi->attrtimeo <<= 1;
+                               if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+                                       nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+                       }
                        nfsi->attrtimeo_timestamp = now;
                }
                /* Set the barrier to be more recent than this fattr */
@@ -1851,7 +1887,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        }
 
        /* Don't declare attrcache up to date if there were no attrs! */
-       if (fattr->valid != 0)
+       if (cache_revalidated)
                invalid &= ~NFS_INO_INVALID_ATTR;
 
        /* Don't invalidate the data if we were to blame */
index 9dea85f..4e8cc94 100644 (file)
@@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
                              struct nfs_pgio_header *hdr,
                              void (*release)(struct nfs_pgio_header *hdr));
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
-int nfs_iocounter_wait(struct nfs_io_counter *c);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
 
 extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
 struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
@@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req);
 struct nfs_pgio_mirror *
 nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
 
-static inline void nfs_iocounter_init(struct nfs_io_counter *c)
-{
-       c->flags = 0;
-       atomic_set(&c->io_count, 0);
-}
-
 static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
 {
        WARN_ON_ONCE(desc->pg_mirror_count < 1);
        return desc->pg_mirror_count > 1;
 }
 
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+               const struct nfs_open_context *ctx2)
+{
+       return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
 /* nfs2xdr.c */
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -380,6 +380,7 @@ extern void nfs_clear_inode(struct inode *);
 extern void nfs_evict_inode(struct inode *);
 void nfs_zap_acl_cache(struct inode *inode);
 extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern int nfs_wait_atomic_killable(atomic_t *p);
 
 /* super.c */
 extern const struct super_operations nfs_sops;
@@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
        inode_dio_wait(inode);
 }
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
-extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
 {
        return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
 }
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+       return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+                               NFS4_STATEID_OTHER_SIZE);
+}
 #else
 static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
 {
        return 0;
 }
+static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+{
+       return 0;
+}
 #endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+       switch (err) {
+       case -ERESTARTSYS:
+       case -EIO:
+       case -ENOSPC:
+       case -EROFS:
+       case -E2BIG:
+               return true;
+       default:
+               return false;
+       }
+}
index 6b1ce98..6e81749 100644 (file)
@@ -204,6 +204,8 @@ static void
 nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 {
        struct nfs42_layoutstat_data *data = calldata;
+       struct inode *inode = data->inode;
+       struct pnfs_layout_hdr *lo;
 
        if (!nfs4_sequence_done(task, &data->res.seq_res))
                return;
@@ -211,12 +213,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
        switch (task->tk_status) {
        case 0:
                break;
+       case -NFS4ERR_EXPIRED:
+       case -NFS4ERR_STALE_STATEID:
+       case -NFS4ERR_OLD_STATEID:
+       case -NFS4ERR_BAD_STATEID:
+               spin_lock(&inode->i_lock);
+               lo = NFS_I(inode)->layout;
+               if (lo && nfs4_stateid_match(&data->args.stateid,
+                                            &lo->plh_stateid)) {
+                       LIST_HEAD(head);
+
+                       /*
+                        * Mark the bad layout state as invalid, then retry
+                        * with the current stateid.
+                        */
+                       set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+                       spin_unlock(&inode->i_lock);
+                       pnfs_free_lseg_list(&head);
+               } else
+                       spin_unlock(&inode->i_lock);
+               break;
        case -ENOTSUPP:
        case -EOPNOTSUPP:
-               NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+               NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
        default:
-               dprintk("%s server returns %d\n", __func__, task->tk_status);
+               break;
        }
+
+       dprintk("%s server returns %d\n", __func__, task->tk_status);
 }
 
 static void
index c57d133..4bfc33a 100644 (file)
@@ -208,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
        | FATTR4_WORD1_TIME_METADATA
        | FATTR4_WORD1_TIME_MODIFY,
        FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+       | FATTR4_WORD2_SECURITY_LABEL
+#endif
 };
 
 static const u32 nfs4_open_noattr_bitmap[3] = {
@@ -1385,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
         * Protect the call to nfs4_state_set_mode_locked and
         * serialise the stateid update
         */
+       spin_lock(&state->owner->so_lock);
        write_seqlock(&state->seqlock);
        if (deleg_stateid != NULL) {
                nfs4_stateid_copy(&state->stateid, deleg_stateid);
@@ -1393,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
        if (open_stateid != NULL)
                nfs_set_open_stateid_locked(state, open_stateid, fmode);
        write_sequnlock(&state->seqlock);
-       spin_lock(&state->owner->so_lock);
        update_open_stateflags(state, fmode);
        spin_unlock(&state->owner->so_lock);
 }
@@ -1598,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 
        if (!data->rpc_done) {
                state = nfs4_try_open_cached(data);
+               trace_nfs4_cached_open(data->state);
                goto out;
        }
 
@@ -2015,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
        }
        return;
 unlock_no_action:
+       trace_nfs4_cached_open(data->state);
        rcu_read_unlock();
 out_no_action:
        task->tk_action = NULL;
@@ -2703,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
        if (status == 0 && state != NULL)
                renew_lease(server, timestamp);
+       trace_nfs4_setattr(inode, &arg.stateid, status);
        return status;
 }
 
@@ -2719,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        int err;
        do {
                err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
-               trace_nfs4_setattr(inode, err);
                switch (err) {
                case -NFS4ERR_OPENMODE:
                        if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -5048,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 static int
 nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 {
-       int result;
        size_t len;
        char *str;
 
@@ -5076,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
                return -ENOMEM;
 
        rcu_read_lock();
-       result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+       scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
                        clp->cl_ipaddr,
                        rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
                        rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
@@ -5089,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 static int
 nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 {
-       int result;
        size_t len;
        char *str;
 
@@ -5109,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
        if (!str)
                return -ENOMEM;
 
-       result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+       scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
                        clp->rpc_ops->version, clp->cl_minorversion,
                        nfs4_client_id_uniquifier,
                        clp->cl_rpcclient->cl_nodename);
@@ -5120,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 static int
 nfs4_init_uniform_client_string(struct nfs_client *clp)
 {
-       int result;
        size_t len;
        char *str;
 
@@ -5145,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
        if (!str)
                return -ENOMEM;
 
-       result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+       scnprintf(str, len, "Linux NFSv%u.%u %s",
                        clp->rpc_ops->version, clp->cl_minorversion,
                        clp->cl_rpcclient->cl_nodename);
        clp->cl_owner_id = str;
@@ -5384,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        if (data == NULL)
                return -ENOMEM;
        nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+
+       nfs4_state_protect(server->nfs_client,
+                       NFS_SP4_MACH_CRED_CLEANUP,
+                       &task_setup_data.rpc_client, &msg);
+
        data->args.fhandle = &data->fh;
        data->args.stateid = &data->stateid;
        data->args.bitmask = server->cache_consistency_bitmask;
@@ -5426,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
        int err;
        do {
                err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
-               trace_nfs4_delegreturn(inode, err);
+               trace_nfs4_delegreturn(inode, stateid, err);
                switch (err) {
                        case -NFS4ERR_STALE_STATEID:
                        case -NFS4ERR_EXPIRED:
@@ -5936,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
                data->cancelled = 1;
        rpc_put_task(task);
        dprintk("%s: done, ret = %d!\n", __func__, ret);
+       trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
        return ret;
 }
 
@@ -5952,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
                if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
                        return 0;
                err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
-               trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
                if (err != -NFS4ERR_DELAY)
                        break;
                nfs4_handle_exception(server, err, &exception);
@@ -5979,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
                if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
                        return 0;
                err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
-               trace_nfs4_lock_expired(request, state, F_SETLK, err);
                switch (err) {
                default:
                        goto out;
@@ -6087,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
 
        do {
                err = _nfs4_proc_setlk(state, cmd, request);
-               trace_nfs4_set_lock(request, state, cmd, err);
                if (err == -NFS4ERR_DENIED)
                        err = -EAGAIN;
                err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -6847,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
        },
        .allow.u.words = {
                [0] = 1 << (OP_CLOSE) |
+                     1 << (OP_OPEN_DOWNGRADE) |
                      1 << (OP_LOCKU) |
+                     1 << (OP_DELEGRETURN) |
                      1 << (OP_COMMIT),
                [1] = 1 << (OP_SECINFO - 32) |
                      1 << (OP_SECINFO_NO_NAME - 32) |
+                     1 << (OP_LAYOUTRETURN - 32) |
                      1 << (OP_TEST_STATEID - 32) |
                      1 << (OP_FREE_STATEID - 32) |
                      1 << (OP_WRITE - 32)
@@ -6915,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
                }
 
                if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+                   test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+                   test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
                    test_bit(OP_LOCKU, sp->allow.u.longs)) {
                        dfprintk(MOUNT, "  cleanup mode enabled\n");
                        set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
                }
 
+               if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+                       dfprintk(MOUNT, "  pnfs cleanup mode enabled\n");
+                       set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+                               &clp->cl_sp4_flags);
+               }
+
                if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
                    test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
                        dfprintk(MOUNT, "  secinfo mode enabled\n");
@@ -7748,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
        struct nfs4_layoutget *lgp = calldata;
        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
        struct nfs4_session *session = nfs4_get_session(server);
+       int ret;
 
        dprintk("--> %s\n", __func__);
        /* Note the is a race here, where a CB_LAYOUTRECALL can come in
@@ -7758,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
        if (nfs41_setup_sequence(session, &lgp->args.seq_args,
                                &lgp->res.seq_res, task))
                return;
-       if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+       ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
                                          NFS_I(lgp->args.inode)->layout,
                                          &lgp->args.range,
-                                         lgp->args.ctx->state)) {
-               rpc_exit(task, NFS4_OK);
-       }
+                                         lgp->args.ctx->state);
+       if (ret < 0)
+               rpc_exit(task, ret);
 }
 
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -7783,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
        switch (task->tk_status) {
        case 0:
                goto out;
+
+       /*
+        * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+        * on the file. set tk_status to -ENODATA to tell upper layer to
+        * retry go inband.
+        */
+       case -NFS4ERR_LAYOUTUNAVAILABLE:
+               task->tk_status = -ENODATA;
+               goto out;
        /*
         * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
         * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
@@ -7979,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
        trace_nfs4_layoutget(lgp->args.ctx,
                        &lgp->args.range,
                        &lgp->res.range,
+                       &lgp->res.stateid,
                        status);
        /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
        if (status == 0 && lgp->res.layoutp->len)
@@ -8035,11 +8062,11 @@ static void nfs4_layoutreturn_release(void *calldata)
 
        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
+       pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+       pnfs_mark_layout_returned_if_empty(lo);
        if (lrp->res.lrs_present)
                pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-       pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
        pnfs_clear_layoutreturn_waitbit(lo);
-       lo->plh_block_lgets--;
        spin_unlock(&lo->plh_inode->i_lock);
        pnfs_free_lseg_list(&freeme);
        pnfs_put_layout_hdr(lrp->args.layout);
@@ -8071,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
        };
        int status = 0;
 
+       nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+                       NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+                       &task_setup_data.rpc_client, &msg);
+
        dprintk("--> %s\n", __func__);
        if (!sync) {
                lrp->inode = nfs_igrab_and_active(lrp->args.inode);
@@ -8086,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
                return PTR_ERR(task);
        if (sync)
                status = task->tk_status;
-       trace_nfs4_layoutreturn(lrp->args.inode, status);
+       trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
        dprintk("<-- %s status=%d\n", __func__, status);
        rpc_put_task(task);
        return status;
@@ -8234,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
                return PTR_ERR(task);
        if (sync)
                status = task->tk_status;
-       trace_nfs4_layoutcommit(data->args.inode, status);
+       trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
        dprintk("%s: status %d\n", __func__, status);
        rpc_put_task(task);
        return status;
index 0fbd3ab..8693d77 100644 (file)
@@ -12,7 +12,7 @@
 #include "nfs4idmap.h"
 #include "callback.h"
 
-static const int nfs_set_port_min = 0;
+static const int nfs_set_port_min;
 static const int nfs_set_port_max = 65535;
 static struct ctl_table_header *nfs4_callback_sysctl_table;
 
index d774335..2850bce 100644 (file)
@@ -6,6 +6,7 @@
 #include "internal.h"
 #include "nfs4session.h"
 #include "callback.h"
+#include "pnfs.h"
 
 #define CREATE_TRACE_POINTS
 #include "nfs4trace.h"
index 671cf68..2c8d05d 100644 (file)
@@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done,
                        __entry->highest_slotid = res->sr_highest_slotid;
                        __entry->target_highest_slotid =
                                        res->sr_target_highest_slotid;
+                       __entry->status_flags = res->sr_status_flags;
                        __entry->error = res->sr_status;
                ),
                TP_printk(
@@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
                        __field(u64, fileid)
                        __field(u64, dir)
                        __string(name, ctx->dentry->d_name.name)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
+                       __field(int, openstateid_seq)
+                       __field(u32, openstateid_hash)
                ),
 
                TP_fast_assign(
@@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
                        __entry->flags = flags;
                        __entry->fmode = (__force unsigned int)ctx->mode;
                        __entry->dev = ctx->dentry->d_sb->s_dev;
-                       if (!IS_ERR_OR_NULL(state))
+                       if (!IS_ERR_OR_NULL(state)) {
                                inode = state->inode;
+                               __entry->stateid_seq =
+                                       be32_to_cpu(state->stateid.seqid);
+                               __entry->stateid_hash =
+                                       nfs_stateid_hash(&state->stateid);
+                               __entry->openstateid_seq =
+                                       be32_to_cpu(state->open_stateid.seqid);
+                               __entry->openstateid_hash =
+                                       nfs_stateid_hash(&state->open_stateid);
+                       } else {
+                               __entry->stateid_seq = 0;
+                               __entry->stateid_hash = 0;
+                               __entry->openstateid_seq = 0;
+                               __entry->openstateid_hash = 0;
+                       }
                        if (inode != NULL) {
                                __entry->fileid = NFS_FILEID(inode);
                                __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
                TP_printk(
                        "error=%d (%s) flags=%d (%s) fmode=%s "
                        "fileid=%02x:%02x:%llu fhandle=0x%08x "
-                       "name=%02x:%02x:%llu/%s",
+                       "name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+                       "openstateid=%d:0x%08x",
                         __entry->error,
                         show_nfsv4_errors(__entry->error),
                         __entry->flags,
@@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
                         __entry->fhandle,
                         MAJOR(__entry->dev), MINOR(__entry->dev),
                         (unsigned long long)__entry->dir,
-                        __get_str(name)
+                        __get_str(name),
+                        __entry->stateid_seq, __entry->stateid_hash,
+                        __entry->openstateid_seq, __entry->openstateid_hash
                )
 );
 
@@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
 DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
 DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
 
+TRACE_EVENT(nfs4_cached_open,
+               TP_PROTO(
+                       const struct nfs4_state *state
+               ),
+               TP_ARGS(state),
+               TP_STRUCT__entry(
+                       __field(dev_t, dev)
+                       __field(u32, fhandle)
+                       __field(u64, fileid)
+                       __field(unsigned int, fmode)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
+               ),
+
+               TP_fast_assign(
+                       const struct inode *inode = state->inode;
+
+                       __entry->dev = inode->i_sb->s_dev;
+                       __entry->fileid = NFS_FILEID(inode);
+                       __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+                       __entry->fmode = (__force unsigned int)state->state;
+                       __entry->stateid_seq =
+                               be32_to_cpu(state->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&state->stateid);
+               ),
+
+               TP_printk(
+                       "fmode=%s fileid=%02x:%02x:%llu "
+                       "fhandle=0x%08x stateid=%d:0x%08x",
+                       __entry->fmode ?  show_fmode_flags(__entry->fmode) :
+                                         "closed",
+                       MAJOR(__entry->dev), MINOR(__entry->dev),
+                       (unsigned long long)__entry->fileid,
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash
+               )
+);
+
 TRACE_EVENT(nfs4_close,
                TP_PROTO(
                        const struct nfs4_state *state,
@@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close,
                        __field(u64, fileid)
                        __field(unsigned int, fmode)
                        __field(int, error)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
                ),
 
                TP_fast_assign(
@@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close,
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
                        __entry->fmode = (__force unsigned int)state->state;
                        __entry->error = error;
+                       __entry->stateid_seq =
+                               be32_to_cpu(args->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&args->stateid);
                ),
 
                TP_printk(
                        "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
-                       "fhandle=0x%08x",
+                       "fhandle=0x%08x openstateid=%d:0x%08x",
                        __entry->error,
                        show_nfsv4_errors(__entry->error),
                        __entry->fmode ?  show_fmode_flags(__entry->fmode) :
                                          "closed",
                        MAJOR(__entry->dev), MINOR(__entry->dev),
                        (unsigned long long)__entry->fileid,
-                       __entry->fhandle
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash
                )
 );
 
@@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
                        __field(dev_t, dev)
                        __field(u32, fhandle)
                        __field(u64, fileid)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
                ),
 
                TP_fast_assign(
@@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+                       __entry->stateid_seq =
+                               be32_to_cpu(state->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&state->stateid);
                ),
 
                TP_printk(
                        "error=%d (%s) cmd=%s:%s range=%lld:%lld "
-                       "fileid=%02x:%02x:%llu fhandle=0x%08x",
+                       "fileid=%02x:%02x:%llu fhandle=0x%08x "
+                       "stateid=%d:0x%08x",
                        __entry->error,
                        show_nfsv4_errors(__entry->error),
                        show_lock_cmd(__entry->cmd),
@@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
                        (long long)__entry->end,
                        MAJOR(__entry->dev), MINOR(__entry->dev),
                        (unsigned long long)__entry->fileid,
-                       __entry->fhandle
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash
                )
 );
 
@@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
                        ), \
                        TP_ARGS(request, state, cmd, error))
 DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
 DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
 
+TRACE_EVENT(nfs4_set_lock,
+               TP_PROTO(
+                       const struct file_lock *request,
+                       const struct nfs4_state *state,
+                       const nfs4_stateid *lockstateid,
+                       int cmd,
+                       int error
+               ),
+
+               TP_ARGS(request, state, lockstateid, cmd, error),
+
+               TP_STRUCT__entry(
+                       __field(int, error)
+                       __field(int, cmd)
+                       __field(char, type)
+                       __field(loff_t, start)
+                       __field(loff_t, end)
+                       __field(dev_t, dev)
+                       __field(u32, fhandle)
+                       __field(u64, fileid)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
+                       __field(int, lockstateid_seq)
+                       __field(u32, lockstateid_hash)
+               ),
+
+               TP_fast_assign(
+                       const struct inode *inode = state->inode;
+
+                       __entry->error = error;
+                       __entry->cmd = cmd;
+                       __entry->type = request->fl_type;
+                       __entry->start = request->fl_start;
+                       __entry->end = request->fl_end;
+                       __entry->dev = inode->i_sb->s_dev;
+                       __entry->fileid = NFS_FILEID(inode);
+                       __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+                       __entry->stateid_seq =
+                               be32_to_cpu(state->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&state->stateid);
+                       __entry->lockstateid_seq =
+                               be32_to_cpu(lockstateid->seqid);
+                       __entry->lockstateid_hash =
+                               nfs_stateid_hash(lockstateid);
+               ),
+
+               TP_printk(
+                       "error=%d (%s) cmd=%s:%s range=%lld:%lld "
+                       "fileid=%02x:%02x:%llu fhandle=0x%08x "
+                       "stateid=%d:0x%08x lockstateid=%d:0x%08x",
+                       __entry->error,
+                       show_nfsv4_errors(__entry->error),
+                       show_lock_cmd(__entry->cmd),
+                       show_lock_type(__entry->type),
+                       (long long)__entry->start,
+                       (long long)__entry->end,
+                       MAJOR(__entry->dev), MINOR(__entry->dev),
+                       (unsigned long long)__entry->fileid,
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash,
+                       __entry->lockstateid_seq, __entry->lockstateid_hash
+               )
+);
+
 DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
                TP_PROTO(
                        const struct inode *inode,
@@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit,
                        __field(dev_t, dev)
                        __field(u32, fhandle)
                        __field(int, error)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
                ),
 
                TP_fast_assign(
                        __entry->dev = res->server->s_dev;
                        __entry->fhandle = nfs_fhandle_hash(args->fhandle);
                        __entry->error = error;
+                       __entry->stateid_seq =
+                               be32_to_cpu(args->stateid->seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(args->stateid);
                ),
 
                TP_printk(
-                       "error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+                       "error=%d (%s) dev=%02x:%02x fhandle=0x%08x "
+                       "stateid=%d:0x%08x",
                        __entry->error,
                        show_nfsv4_errors(__entry->error),
                        MAJOR(__entry->dev), MINOR(__entry->dev),
-                       __entry->fhandle
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash
                )
 );
 
@@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
                        __field(dev_t, dev)
                        __field(u32, fhandle)
                        __field(u64, fileid)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
                ),
 
                TP_fast_assign(
@@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+                       __entry->stateid_seq =
+                               be32_to_cpu(state->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&state->stateid);
                ),
 
                TP_printk(
-                       "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+                       "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+                       "stateid=%d:0x%08x",
                        __entry->error,
                        show_nfsv4_errors(__entry->error),
                        MAJOR(__entry->dev), MINOR(__entry->dev),
                        (unsigned long long)__entry->fileid,
-                       __entry->fhandle
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash
                )
 );
 
@@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
                        ), \
                        TP_ARGS(inode, error))
 
-DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
 DEFINE_NFS4_INODE_EVENT(nfs4_access);
 DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
 DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
@@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
 DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
 DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
 #endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
-DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
+               TP_PROTO(
+                       const struct inode *inode,
+                       const nfs4_stateid *stateid,
+                       int error
+               ),
+
+               TP_ARGS(inode, stateid, error),
+
+               TP_STRUCT__entry(
+                       __field(dev_t, dev)
+                       __field(u32, fhandle)
+                       __field(u64, fileid)
+                       __field(int, error)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
+               ),
+
+               TP_fast_assign(
+                       __entry->dev = inode->i_sb->s_dev;
+                       __entry->fileid = NFS_FILEID(inode);
+                       __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+                       __entry->error = error;
+                       __entry->stateid_seq =
+                               be32_to_cpu(stateid->seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(stateid);
+               ),
+
+               TP_printk(
+                       "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+                       "stateid=%d:0x%08x",
+                       __entry->error,
+                       show_nfsv4_errors(__entry->error),
+                       MAJOR(__entry->dev), MINOR(__entry->dev),
+                       (unsigned long long)__entry->fileid,
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash
+               )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \
+       DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+                       TP_PROTO( \
+                               const struct inode *inode, \
+                               const nfs4_stateid *stateid, \
+                               int error \
+                       ), \
+                       TP_ARGS(inode, stateid, error))
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
 
 DECLARE_EVENT_CLASS(nfs4_getattr_event,
                TP_PROTO(
@@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
                        ), \
                        TP_ARGS(clp, fhandle, inode, error))
 DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
-DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
 
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
+               TP_PROTO(
+                       const struct nfs_client *clp,
+                       const struct nfs_fh *fhandle,
+                       const struct inode *inode,
+                       const nfs4_stateid *stateid,
+                       int error
+               ),
+
+               TP_ARGS(clp, fhandle, inode, stateid, error),
+
+               TP_STRUCT__entry(
+                       __field(int, error)
+                       __field(dev_t, dev)
+                       __field(u32, fhandle)
+                       __field(u64, fileid)
+                       __string(dstaddr, clp ?
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                       RPC_DISPLAY_ADDR) : "unknown")
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
+               ),
+
+               TP_fast_assign(
+                       __entry->error = error;
+                       __entry->fhandle = nfs_fhandle_hash(fhandle);
+                       if (inode != NULL) {
+                               __entry->fileid = NFS_FILEID(inode);
+                               __entry->dev = inode->i_sb->s_dev;
+                       } else {
+                               __entry->fileid = 0;
+                               __entry->dev = 0;
+                       }
+                       __assign_str(dstaddr, clp ?
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                       RPC_DISPLAY_ADDR) : "unknown")
+                       __entry->stateid_seq =
+                               be32_to_cpu(stateid->seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(stateid);
+               ),
+
+               TP_printk(
+                       "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+                       "stateid=%d:0x%08x dstaddr=%s",
+                       __entry->error,
+                       show_nfsv4_errors(__entry->error),
+                       MAJOR(__entry->dev), MINOR(__entry->dev),
+                       (unsigned long long)__entry->fileid,
+                       __entry->fhandle,
+                       __entry->stateid_seq, __entry->stateid_hash,
+                       __get_str(dstaddr)
+               )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \
+       DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+                       TP_PROTO( \
+                               const struct nfs_client *clp, \
+                               const struct nfs_fh *fhandle, \
+                               const struct inode *inode, \
+                               const nfs4_stateid *stateid, \
+                               int error \
+                       ), \
+                       TP_ARGS(clp, fhandle, inode, stateid, error))
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
 
 DECLARE_EVENT_CLASS(nfs4_idmap_event,
                TP_PROTO(
@@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
                        __field(loff_t, offset)
                        __field(size_t, count)
                        __field(int, error)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
                ),
 
                TP_fast_assign(
                        const struct inode *inode = hdr->inode;
+                       const struct nfs4_state *state =
+                               hdr->args.context->state;
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
                        __entry->offset = hdr->args.offset;
                        __entry->count = hdr->args.count;
                        __entry->error = error;
+                       __entry->stateid_seq =
+                               be32_to_cpu(state->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&state->stateid);
                ),
 
                TP_printk(
                        "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-                       "offset=%lld count=%zu",
+                       "offset=%lld count=%zu stateid=%d:0x%08x",
                        __entry->error,
                        show_nfsv4_errors(__entry->error),
                        MAJOR(__entry->dev), MINOR(__entry->dev),
                        (unsigned long long)__entry->fileid,
                        __entry->fhandle,
                        (long long)__entry->offset,
-                       __entry->count
+                       __entry->count,
+                       __entry->stateid_seq, __entry->stateid_hash
                )
 );
 #define DEFINE_NFS4_READ_EVENT(name) \
@@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
                        __field(loff_t, offset)
                        __field(size_t, count)
                        __field(int, error)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
                ),
 
                TP_fast_assign(
                        const struct inode *inode = hdr->inode;
+                       const struct nfs4_state *state =
+                               hdr->args.context->state;
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
                        __entry->offset = hdr->args.offset;
                        __entry->count = hdr->args.count;
                        __entry->error = error;
+                       __entry->stateid_seq =
+                               be32_to_cpu(state->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&state->stateid);
                ),
 
                TP_printk(
                        "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-                       "offset=%lld count=%zu",
+                       "offset=%lld count=%zu stateid=%d:0x%08x",
                        __entry->error,
                        show_nfsv4_errors(__entry->error),
                        MAJOR(__entry->dev), MINOR(__entry->dev),
                        (unsigned long long)__entry->fileid,
                        __entry->fhandle,
                        (long long)__entry->offset,
-                       __entry->count
+                       __entry->count,
+                       __entry->stateid_seq, __entry->stateid_hash
                )
 );
 
@@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget,
                        const struct nfs_open_context *ctx,
                        const struct pnfs_layout_range *args,
                        const struct pnfs_layout_range *res,
+                       const nfs4_stateid *layout_stateid,
                        int error
                ),
 
-               TP_ARGS(ctx, args, res, error),
+               TP_ARGS(ctx, args, res, layout_stateid, error),
 
                TP_STRUCT__entry(
                        __field(dev_t, dev)
@@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget,
                        __field(u64, offset)
                        __field(u64, count)
                        __field(int, error)
+                       __field(int, stateid_seq)
+                       __field(u32, stateid_hash)
+                       __field(int, layoutstateid_seq)
+                       __field(u32, layoutstateid_hash)
                ),
 
                TP_fast_assign(
                        const struct inode *inode = d_inode(ctx->dentry);
+                       const struct nfs4_state *state = ctx->state;
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget,
                        __entry->offset = args->offset;
                        __entry->count = args->length;
                        __entry->error = error;
+                       __entry->stateid_seq =
+                               be32_to_cpu(state->stateid.seqid);
+                       __entry->stateid_hash =
+                               nfs_stateid_hash(&state->stateid);
+                       if (!error) {
+                               __entry->layoutstateid_seq =
+                               be32_to_cpu(layout_stateid->seqid);
+                               __entry->layoutstateid_hash =
+                               nfs_stateid_hash(layout_stateid);
+                       } else {
+                               __entry->layoutstateid_seq = 0;
+                               __entry->layoutstateid_hash = 0;
+                       }
                ),
 
                TP_printk(
                        "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-                       "iomode=%s offset=%llu count=%llu",
+                       "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+                       "layoutstateid=%d:0x%08x",
                        __entry->error,
                        show_nfsv4_errors(__entry->error),
                        MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget,
                        __entry->fhandle,
                        show_pnfs_iomode(__entry->iomode),
                        (unsigned long long)__entry->offset,
-                       (unsigned long long)__entry->count
+                       (unsigned long long)__entry->count,
+                       __entry->stateid_seq, __entry->stateid_hash,
+                       __entry->layoutstateid_seq, __entry->layoutstateid_hash
                )
 );
 
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
 
+#define show_pnfs_update_layout_reason(reason)                         \
+       __print_symbolic(reason,                                        \
+               { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" },              \
+               { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" },              \
+               { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" },      \
+               { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" },          \
+               { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" },                  \
+               { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" },      \
+               { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" },    \
+               { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" },    \
+               { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" },          \
+               { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },      \
+               { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+
+TRACE_EVENT(pnfs_update_layout,
+               TP_PROTO(struct inode *inode,
+                       loff_t pos,
+                       u64 count,
+                       enum pnfs_iomode iomode,
+                       struct pnfs_layout_hdr *lo,
+                       enum pnfs_update_layout_reason reason
+               ),
+               TP_ARGS(inode, pos, count, iomode, lo, reason),
+               TP_STRUCT__entry(
+                       __field(dev_t, dev)
+                       __field(u64, fileid)
+                       __field(u32, fhandle)
+                       __field(loff_t, pos)
+                       __field(u64, count)
+                       __field(enum pnfs_iomode, iomode)
+                       __field(int, layoutstateid_seq)
+                       __field(u32, layoutstateid_hash)
+                       __field(enum pnfs_update_layout_reason, reason)
+               ),
+               TP_fast_assign(
+                       __entry->dev = inode->i_sb->s_dev;
+                       __entry->fileid = NFS_FILEID(inode);
+                       __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+                       __entry->pos = pos;
+                       __entry->count = count;
+                       __entry->iomode = iomode;
+                       __entry->reason = reason;
+                       if (lo != NULL) {
+                               __entry->layoutstateid_seq =
+                               be32_to_cpu(lo->plh_stateid.seqid);
+                               __entry->layoutstateid_hash =
+                               nfs_stateid_hash(&lo->plh_stateid);
+                       } else {
+                               __entry->layoutstateid_seq = 0;
+                               __entry->layoutstateid_hash = 0;
+                       }
+               ),
+               TP_printk(
+                       "fileid=%02x:%02x:%llu fhandle=0x%08x "
+                       "iomode=%s pos=%llu count=%llu "
+                       "layoutstateid=%d:0x%08x (%s)",
+                       MAJOR(__entry->dev), MINOR(__entry->dev),
+                       (unsigned long long)__entry->fileid,
+                       __entry->fhandle,
+                       show_pnfs_iomode(__entry->iomode),
+                       (unsigned long long)__entry->pos,
+                       (unsigned long long)__entry->count,
+                       __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+                       show_pnfs_update_layout_reason(__entry->reason)
+               )
+);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* _TRACE_NFS4_H */
index 59f838c..9f80a08 100644 (file)
@@ -39,7 +39,6 @@
                        { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
                        { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
                        { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
-                       { 1 << NFS_INO_COMMIT, "COMMIT" }, \
                        { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
                        { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
 
index 452a011..8ce4f61 100644 (file)
@@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p)
        kmem_cache_free(nfs_page_cachep, p);
 }
 
-static void
-nfs_iocounter_inc(struct nfs_io_counter *c)
-{
-       atomic_inc(&c->io_count);
-}
-
-static void
-nfs_iocounter_dec(struct nfs_io_counter *c)
-{
-       if (atomic_dec_and_test(&c->io_count)) {
-               clear_bit(NFS_IO_INPROGRESS, &c->flags);
-               smp_mb__after_atomic();
-               wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
-       }
-}
-
-static int
-__nfs_iocounter_wait(struct nfs_io_counter *c)
-{
-       wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
-       DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
-       int ret = 0;
-
-       do {
-               prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
-               set_bit(NFS_IO_INPROGRESS, &c->flags);
-               if (atomic_read(&c->io_count) == 0)
-                       break;
-               ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE);
-       } while (atomic_read(&c->io_count) != 0 && !ret);
-       finish_wait(wq, &q.wait);
-       return ret;
-}
-
 /**
  * nfs_iocounter_wait - wait for i/o to complete
- * @c: nfs_io_counter to use
+ * @l_ctx: nfs_lock_context whose io_count is waited on
  *
  * returns -ERESTARTSYS if interrupted by a fatal signal.
  * Otherwise returns 0 once the io_count hits 0.
  */
 int
-nfs_iocounter_wait(struct nfs_io_counter *c)
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
 {
-       if (atomic_read(&c->io_count) == 0)
-               return 0;
-       return __nfs_iocounter_wait(c);
+       return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
+                       TASK_KILLABLE);
 }
 
 /*
@@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
                return ERR_CAST(l_ctx);
        }
        req->wb_lock_context = l_ctx;
-       nfs_iocounter_inc(&l_ctx->io_count);
+       atomic_inc(&l_ctx->io_count);
 
        /* Initialize the request struct. Initially, we assume a
         * long write-back delay. This will be adjusted in
@@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req)
                req->wb_page = NULL;
        }
        if (l_ctx != NULL) {
-               nfs_iocounter_dec(&l_ctx->io_count);
+               if (atomic_dec_and_test(&l_ctx->io_count))
+                       wake_up_atomic_t(&l_ctx->io_count);
                nfs_put_lock_context(l_ctx);
                req->wb_lock_context = NULL;
        }
@@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
  * @desc: IO descriptor
  * @hdr: pageio header
  */
-static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
-                         struct nfs_pgio_header *hdr)
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_mirror *mirror;
-       u32 midx;
-
        set_bit(NFS_IOHDR_REDO, &hdr->flags);
        nfs_pgio_data_destroy(hdr);
        hdr->completion_ops->completion(hdr);
-       /* TODO: Make sure it's right to clean up all mirrors here
-        *       and not just hdr->pgio_mirror_idx */
-       for (midx = 0; midx < desc->pg_mirror_count; midx++) {
-               mirror = &desc->pg_mirrors[midx];
-               desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-       }
-       return -ENOMEM;
 }
 
 /**
@@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
        unsigned int pagecount, pageused;
 
        pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
-       if (!nfs_pgarray_set(&hdr->page_array, pagecount))
-               return nfs_pgio_error(desc, hdr);
+       if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
+               nfs_pgio_error(hdr);
+               desc->pg_error = -ENOMEM;
+               return desc->pg_error;
+       }
 
        nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
        pages = hdr->page_array.pagevec;
@@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
                        *pages++ = last_page = req->wb_page;
                }
        }
-       if (WARN_ON_ONCE(pageused != pagecount))
-               return nfs_pgio_error(desc, hdr);
+       if (WARN_ON_ONCE(pageused != pagecount)) {
+               nfs_pgio_error(hdr);
+               desc->pg_error = -EINVAL;
+               return desc->pg_error;
+       }
 
        if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
            (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
@@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
 
 static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_pgio_mirror *mirror;
        struct nfs_pgio_header *hdr;
        int ret;
 
-       mirror = nfs_pgio_current_mirror(desc);
-
        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
        if (!hdr) {
-               /* TODO: make sure this is right with mirroring - or
-                *       should it back out all mirrors? */
-               desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-               return -ENOMEM;
+               desc->pg_error = -ENOMEM;
+               return desc->pg_error;
        }
        nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
        ret = nfs_generic_pgio(desc, hdr);
@@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
 
        mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
 
+       if (pgio->pg_error < 0)
+               return pgio->pg_error;
+
        if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
                return -EINVAL;
 
@@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
        pgio->pg_mirrors_dynamic = NULL;
 }
 
-static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
-               const struct nfs_open_context *ctx2)
-{
-       return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
-}
-
 static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
                const struct nfs_lock_context *l2)
 {
@@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
        } else {
                if (desc->pg_ops->pg_init)
                        desc->pg_ops->pg_init(desc, req);
+               if (desc->pg_error < 0)
+                       return 0;
                mirror->pg_base = req->wb_pgbase;
        }
        if (!nfs_can_coalesce_requests(prev, req, desc))
@@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
        bytes = req->wb_bytes;
 
        nfs_pageio_setup_mirroring(desc, req);
+       if (desc->pg_error < 0)
+               goto out_failed;
 
        for (midx = 0; midx < desc->pg_mirror_count; midx++) {
                if (midx) {
@@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 
                        if (IS_ERR(dupreq)) {
                                nfs_page_group_unlock(req);
-                               return 0;
+                               desc->pg_error = PTR_ERR(dupreq);
+                               goto out_failed;
                        }
 
                        nfs_lock_request(dupreq);
@@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
                if (nfs_pgio_has_mirroring(desc))
                        desc->pg_mirror_idx = midx;
                if (!nfs_pageio_add_request_mirror(desc, dupreq))
-                       return 0;
+                       goto out_failed;
        }
 
        return 1;
+
+out_failed:
+       /*
+        * We might have failed before sending any reqs over the wire.
+        * Clean up the rest of the reqs in each mirror's pg_list.
+        */
+       if (desc->pg_error) {
+               struct nfs_pgio_mirror *mirror;
+               void (*func)(struct list_head *);
+
+               /* remember fatal errors */
+               if (nfs_error_is_fatal(desc->pg_error))
+                       mapping_set_error(desc->pg_inode->i_mapping,
+                                         desc->pg_error);
+
+               func = desc->pg_completion_ops->error_cleanup;
+               for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+                       mirror = &desc->pg_mirrors[midx];
+                       func(&mirror->pg_list);
+               }
+       }
+       return 0;
 }
 
 /*
@@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
        nfs_pageio_complete(desc);
        if (!list_empty(&failed)) {
                list_move(&failed, &hdr->pages);
-               return -EIO;
+               return desc->pg_error < 0 ? desc->pg_error : -EIO;
        }
        return 0;
 }
index bec0384..a3592cc 100644 (file)
@@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
 static LIST_HEAD(pnfs_modules_tbl);
 
 static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
                       enum pnfs_iomode iomode, bool sync);
 
 /* Return the registered pnfs layout driver module matching given id */
@@ -385,13 +385,13 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
                enum pnfs_iomode iomode;
                bool send;
 
-               stateid = lo->plh_stateid;
+               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
                iomode = lo->plh_return_iomode;
                send = pnfs_prepare_layoutreturn(lo);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we dont deadlock */
-                       pnfs_send_layoutreturn(lo, stateid, iomode, false);
+                       pnfs_send_layoutreturn(lo, &stateid, iomode, false);
                }
        } else
                spin_unlock(&inode->i_lock);
@@ -566,10 +566,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 int
 pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
-                           struct pnfs_layout_range *recall_range)
+                           const struct pnfs_layout_range *recall_range)
 {
        struct pnfs_layout_segment *lseg, *next;
-       int invalid = 0, removed = 0;
+       int remaining = 0;
 
        dprintk("%s:Begin lo %p\n", __func__, lo);
 
@@ -582,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
                                lseg->pls_range.length);
-                       invalid++;
-                       removed += mark_lseg_invalid(lseg, tmp_list);
+                       if (!mark_lseg_invalid(lseg, tmp_list))
+                               remaining++;
                }
-       dprintk("%s:Return %i\n", __func__, invalid - removed);
-       return invalid - removed;
+       dprintk("%s:Return %i\n", __func__, remaining);
+       return remaining;
 }
 
 /* note free_me must contain lsegs from a single layout_hdr */
@@ -618,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
                pnfs_get_layout_hdr(lo);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
-               pnfs_clear_retry_layoutget(lo);
                spin_unlock(&nfsi->vfs_inode.i_lock);
                pnfs_free_lseg_list(&tmp_list);
                pnfs_put_layout_hdr(lo);
@@ -703,6 +702,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
                        ret = -EAGAIN;
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&lseg_list);
+               /* Free all lsegs that are attached to commit buckets */
+               nfs_commit_inode(inode, 0);
                pnfs_put_layout_hdr(lo);
                iput(inode);
        }
@@ -826,7 +827,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
 
 int
 pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-                             struct pnfs_layout_range *range,
+                             const struct pnfs_layout_range *range,
                              struct nfs4_state *open_state)
 {
        int status = 0;
@@ -861,7 +862,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
-          struct pnfs_layout_range *range,
+          const struct pnfs_layout_range *range,
           gfp_t gfp_flags)
 {
        struct inode *ino = lo->plh_inode;
@@ -894,7 +895,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
                                lgp->args.minlength = i_size - range->offset;
                }
                lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-               lgp->args.range = *range;
+               pnfs_copy_range(&lgp->args.range, range);
                lgp->args.type = server->pnfs_curr_ld->id;
                lgp->args.inode = ino;
                lgp->args.ctx = get_nfs_open_context(ctx);
@@ -904,17 +905,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
                lseg = nfs4_proc_layoutget(lgp, gfp_flags);
        } while (lseg == ERR_PTR(-EAGAIN));
 
-       if (IS_ERR(lseg)) {
-               switch (PTR_ERR(lseg)) {
-               case -ENOMEM:
-               case -ERESTARTSYS:
-                       break;
-               default:
-                       /* remember that LAYOUTGET failed and suspend trying */
-                       pnfs_layout_io_set_failed(lo, range->iomode);
-               }
-               return NULL;
-       } else
+       if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
+               lseg = NULL;
+       else
                pnfs_layout_clear_fail_bit(lo,
                                pnfs_iomode_to_fail_bit(range->iomode));
 
@@ -945,7 +938,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
 }
 
 static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
                       enum pnfs_iomode iomode, bool sync)
 {
        struct inode *ino = lo->plh_inode;
@@ -962,7 +955,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
                goto out;
        }
 
-       lrp->args.stateid = stateid;
+       nfs4_stateid_copy(&lrp->args.stateid, stateid);
        lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
        lrp->args.inode = ino;
        lrp->args.range.iomode = iomode;
@@ -1005,7 +998,7 @@ _pnfs_return_layout(struct inode *ino)
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
-       stateid = nfsi->layout->plh_stateid;
+       nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
@@ -1033,7 +1026,7 @@ _pnfs_return_layout(struct inode *ino)
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        if (send)
-               status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+               status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
 out_put_layout_hdr:
        pnfs_put_layout_hdr(lo);
 out:
@@ -1096,13 +1089,12 @@ bool pnfs_roc(struct inode *ino)
                        goto out_noroc;
        }
 
-       stateid = lo->plh_stateid;
+       nfs4_stateid_copy(&stateid, &lo->plh_stateid);
        /* always send layoutreturn if being marked so */
        if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
                                   &lo->plh_flags))
                layoutreturn = pnfs_prepare_layoutreturn(lo);
 
-       pnfs_clear_retry_layoutget(lo);
        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
                /* If we are sending layoutreturn, invalidate all valid lsegs */
                if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
@@ -1124,7 +1116,7 @@ out_noroc:
        pnfs_free_lseg_list(&tmp_list);
        pnfs_layoutcommit_inode(ino, true);
        if (layoutreturn)
-               pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+               pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
        return roc;
 }
 
@@ -1149,6 +1141,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 
        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
+       pnfs_mark_layout_returned_if_empty(lo);
        if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
@@ -1465,25 +1458,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
        return ret;
 }
 
-/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
-static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode)
-{
-       if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
-               return 1;
-       return nfs_wait_bit_killable(key, mode);
-}
-
 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
 {
-       if (!pnfs_should_retry_layoutget(lo))
-               return false;
        /*
         * send layoutcommit as it can hold up layoutreturn due to lseg
         * reference
         */
        pnfs_layoutcommit_inode(lo->plh_inode, false);
        return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
-                                  pnfs_layoutget_retry_bit_wait,
+                                  nfs_wait_bit_killable,
                                   TASK_UNINTERRUPTIBLE);
 }
 
@@ -1520,14 +1503,23 @@ pnfs_update_layout(struct inode *ino,
        struct pnfs_layout_segment *lseg = NULL;
        bool first;
 
-       if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+       if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+                                PNFS_UPDATE_LAYOUT_NO_PNFS);
                goto out;
+       }
 
-       if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+       if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+                                PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
                goto out;
+       }
 
-       if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+       if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+                                PNFS_UPDATE_LAYOUT_MDSTHRESH);
                goto out;
+       }
 
 lookup_again:
        first = false;
@@ -1535,19 +1527,25 @@ lookup_again:
        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
                spin_unlock(&ino->i_lock);
+               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+                                PNFS_UPDATE_LAYOUT_NOMEM);
                goto out;
        }
 
        /* Do we even need to bother with this? */
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                                PNFS_UPDATE_LAYOUT_BULK_RECALL);
                dprintk("%s matches recall, use MDS\n", __func__);
                goto out_unlock;
        }
 
        /* if LAYOUTGET already failed once we don't try again */
-       if (pnfs_layout_io_test_failed(lo, iomode) &&
-           !pnfs_should_retry_layoutget(lo))
+       if (pnfs_layout_io_test_failed(lo, iomode)) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                                PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
                goto out_unlock;
+       }
 
        first = list_empty(&lo->plh_segs);
        if (first) {
@@ -1567,8 +1565,11 @@ lookup_again:
                 * already exists
                 */
                lseg = pnfs_find_lseg(lo, &arg);
-               if (lseg)
+               if (lseg) {
+                       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                                       PNFS_UPDATE_LAYOUT_FOUND_CACHED);
                        goto out_unlock;
+               }
        }
 
        /*
@@ -1585,11 +1586,16 @@ lookup_again:
                        dprintk("%s retrying\n", __func__);
                        goto lookup_again;
                }
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                               PNFS_UPDATE_LAYOUT_RETURN);
                goto out_put_layout_hdr;
        }
 
-       if (pnfs_layoutgets_blocked(lo))
+       if (pnfs_layoutgets_blocked(lo)) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                               PNFS_UPDATE_LAYOUT_BLOCKED);
                goto out_unlock;
+       }
        atomic_inc(&lo->plh_outstanding);
        spin_unlock(&ino->i_lock);
 
@@ -1612,8 +1618,9 @@ lookup_again:
                arg.length = PAGE_CACHE_ALIGN(arg.length);
 
        lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
-       pnfs_clear_retry_layoutget(lo);
        atomic_dec(&lo->plh_outstanding);
+       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                                PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
 out_put_layout_hdr:
        if (first)
                pnfs_clear_first_layoutget(lo);
@@ -1623,7 +1630,7 @@ out:
                        "(%s, offset: %llu, length: %llu)\n",
                        __func__, ino->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(ino),
-                       lseg == NULL ? "not found" : "found",
+                       IS_ERR_OR_NULL(lseg) ? "not found" : "found",
                        iomode==IOMODE_RW ?  "read/write" : "read-only",
                        (unsigned long long)pos,
                        (unsigned long long)count);
@@ -1730,16 +1737,29 @@ out_forget_reply:
 }
 
 static void
+pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+{
+       if (lo->plh_return_iomode == iomode)
+               return;
+       if (lo->plh_return_iomode != 0)
+               iomode = IOMODE_ANY;
+       lo->plh_return_iomode = iomode;
+}
+
+int
 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               struct pnfs_layout_range *return_range)
+                               const struct pnfs_layout_range *return_range)
 {
        struct pnfs_layout_segment *lseg, *next;
+       int remaining = 0;
 
        dprintk("%s:Begin lo %p\n", __func__, lo);
 
        if (list_empty(&lo->plh_segs))
-               return;
+               return 0;
+
+       assert_spin_locked(&lo->plh_inode->i_lock);
 
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (should_free_lseg(&lseg->pls_range, return_range)) {
@@ -1749,38 +1769,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                lseg->pls_range.offset,
                                lseg->pls_range.length);
                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
-                       mark_lseg_invalid(lseg, tmp_list);
+                       pnfs_set_plh_return_iomode(lo, return_range->iomode);
+                       if (!mark_lseg_invalid(lseg, tmp_list))
+                               remaining++;
                        set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
                                        &lo->plh_flags);
                }
+       return remaining;
 }
 
 void pnfs_error_mark_layout_for_return(struct inode *inode,
                                       struct pnfs_layout_segment *lseg)
 {
        struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
-       int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
        struct pnfs_layout_range range = {
                .iomode = lseg->pls_range.iomode,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        LIST_HEAD(free_me);
+       bool return_now = false;
 
        spin_lock(&inode->i_lock);
-       /* set failure bit so that pnfs path will be retried later */
-       pnfs_layout_set_fail_bit(lo, iomode);
-       if (lo->plh_return_iomode == 0)
-               lo->plh_return_iomode = range.iomode;
-       else if (lo->plh_return_iomode != range.iomode)
-               lo->plh_return_iomode = IOMODE_ANY;
+       pnfs_set_plh_return_iomode(lo, range.iomode);
        /*
         * mark all matching lsegs so that we are sure to have no live
         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
         * for how it works.
         */
-       pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
-       spin_unlock(&inode->i_lock);
+       if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+               nfs4_stateid stateid;
+               enum pnfs_iomode iomode = lo->plh_return_iomode;
+
+               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+               return_now = pnfs_prepare_layoutreturn(lo);
+               spin_unlock(&inode->i_lock);
+               if (return_now)
+                       pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+       } else {
+               spin_unlock(&inode->i_lock);
+               nfs_commit_inode(inode, 0);
+       }
        pnfs_free_lseg_list(&free_me);
 }
 EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -1802,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
                                                   rd_size,
                                                   IOMODE_READ,
                                                   GFP_KERNEL);
+               if (IS_ERR(pgio->pg_lseg)) {
+                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+                       pgio->pg_lseg = NULL;
+                       return;
+               }
        }
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
@@ -1814,13 +1848,19 @@ void
 pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
                           struct nfs_page *req, u64 wb_size)
 {
-       if (pgio->pg_lseg == NULL)
+       if (pgio->pg_lseg == NULL) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
                                                   req_offset(req),
                                                   wb_size,
                                                   IOMODE_RW,
                                                   GFP_NOFS);
+               if (IS_ERR(pgio->pg_lseg)) {
+                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+                       pgio->pg_lseg = NULL;
+                       return;
+               }
+       }
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
                nfs_pageio_reset_write_mds(pgio);
@@ -1988,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
        struct nfs_pgio_header *hdr;
        int ret;
 
        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
        if (!hdr) {
-               desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-               return -ENOMEM;
+               desc->pg_error = -ENOMEM;
+               return desc->pg_error;
        }
        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
 
@@ -2119,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
        struct nfs_pgio_header *hdr;
        int ret;
 
        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
        if (!hdr) {
-               desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-               return -ENOMEM;
+               desc->pg_error = -ENOMEM;
+               return desc->pg_error;
        }
        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
index d1990e9..9f4e2a4 100644 (file)
@@ -98,7 +98,6 @@ enum {
        NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
        NFS_LAYOUT_INVALID_STID,        /* layout stateid id is invalid */
        NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first layoutget */
-       NFS_LAYOUT_RETRY_LAYOUTGET,     /* Retry layoutget */
 };
 
 enum layoutdriver_policy_flags {
@@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
                             bool update_barrier);
 int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
                                  struct pnfs_layout_hdr *lo,
-                                 struct pnfs_layout_range *range,
+                                 const struct pnfs_layout_range *range,
                                  struct nfs4_state *open_state);
 int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               struct pnfs_layout_range *recall_range);
+                               const struct pnfs_layout_range *recall_range);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+                               struct list_head *tmp_list,
+                               const struct pnfs_layout_range *recall_range);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
        return d;
 }
 
-static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
-       if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
-               atomic_inc(&lo->plh_refcount);
-}
-
-static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
-       if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
-               atomic_dec(&lo->plh_refcount);
-               /* wake up waiters for LAYOUTRETURN as that is not needed */
-               wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
-       }
-}
-
-static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
-       return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
-}
-
 static inline struct pnfs_layout_segment *
 pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
        return lseg;
 }
 
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+       return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0;
+}
+
 /* Return true if a layout driver is being used for this mountpoint */
 static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 {
@@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end)
        return 1 + end - offset;
 }
 
+/**
+ * pnfs_mark_layout_returned_if_empty - marks the layout as returned
+ * @lo: layout header
+ *
+ * Sets NFS_LAYOUT_INVALID_STID in @lo->plh_flags when the segment list
+ * @lo->plh_segs is empty; a layout that still has segments is left alone.
+ *
+ * Note: Caller must hold inode->i_lock
+ */
+static inline void
+pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
+{
+       if (!list_empty(&lo->plh_segs))
+               return;
+       set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+}
+
+/* Copy the complete layout range *@src over *@dst. */
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+               const struct pnfs_layout_range *src)
+{
+       memcpy(dst, src, sizeof(struct pnfs_layout_range));
+}
+
 extern unsigned int layoutstats_timer;
 
 #ifdef NFS_DEBUG
index 24655b8..81ac648 100644 (file)
@@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                } else {
                        nfs_retry_commit(mds_pages, NULL, cinfo, 0);
                        pnfs_generic_retry_commit(cinfo, 0);
-                       cinfo->completion_ops->error_cleanup(NFS_I(inode));
                        return -ENOMEM;
                }
        }
 
        nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
 
-       if (nreq == 0) {
-               cinfo->completion_ops->error_cleanup(NFS_I(inode));
+       if (nreq == 0)
                goto out;
-       }
 
        atomic_add(nreq, &cinfo->mds->rpcs_out);
 
@@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
        buckets = cinfo->ds->buckets;
        list = &buckets[ds_commit_idx].written;
        if (list_empty(list)) {
+               if (!pnfs_is_valid_lseg(lseg)) {
+                       spin_unlock(cinfo->lock);
+                       cinfo->completion_ops->resched_write(cinfo, req);
+                       return;
+               }
                /* Non-empty buckets hold a reference on the lseg.  That ref
                 * is normally transferred to the COMMIT call and released
                 * there.  It could also be released if the last req is pulled
index 0a5e33f..eb31e23 100644 (file)
@@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
+/*
+ * Finish with a read request: log it and, once the page group has synced
+ * on PG_UNLOCKPAGE, pass an up-to-date page to fscache and unlock the page.
+ * The request reference is dropped in every case.
+ */
+static void nfs_readpage_release(struct nfs_page *req)
+{
+       struct inode *inode = d_inode(req->wb_context->dentry);
+       struct page *page;
+
+       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+               (long long)req_offset(req));
+
+       if (!nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
+               goto out_release;
+
+       page = req->wb_page;
+       if (PageUptodate(page))
+               nfs_readpage_to_fscache(inode, page, 0);
+       unlock_page(page);
+
+out_release:
+       nfs_release_request(req);
+}
+
 int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
                       struct page *page)
 {
@@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
        nfs_pageio_init_read(&pgio, inode, false,
                             &nfs_async_read_completion_ops);
-       nfs_pageio_add_request(&pgio, new);
+       if (!nfs_pageio_add_request(&pgio, new)) {
+               nfs_list_remove_request(new);
+               nfs_readpage_release(new);
+       }
        nfs_pageio_complete(&pgio);
 
        /* It doesn't make sense to do mirrored reads! */
@@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
        pgm = &pgio.pg_mirrors[0];
        NFS_I(inode)->read_io += pgm->pg_bytes_written;
 
-       return 0;
-}
-
-static void nfs_readpage_release(struct nfs_page *req)
-{
-       struct inode *inode = d_inode(req->wb_context->dentry);
-
-       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
-               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
-               (long long)req_offset(req));
-
-       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
-               if (PageUptodate(req->wb_page))
-                       nfs_readpage_to_fscache(inode, req->wb_page, 0);
-
-               unlock_page(req->wb_page);
-       }
-       nfs_release_request(req);
+       return pgio.pg_error < 0 ? pgio.pg_error : 0;
 }
 
 static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -361,6 +364,8 @@ readpage_async_filler(void *data, struct page *page)
        if (len < PAGE_CACHE_SIZE)
                zero_user_segment(page, len, PAGE_CACHE_SIZE);
        if (!nfs_pageio_add_request(desc->pgio, new)) {
+               nfs_list_remove_request(new);
+               nfs_readpage_release(new);
                error = desc->pgio->pg_error;
                goto out_unlock;
        }
index 7b93164..ce43cd6 100644 (file)
@@ -21,6 +21,8 @@
 #include <linux/nfs_page.h>
 #include <linux/backing-dev.h>
 #include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
 
 #include <asm/uaccess.h>
 
@@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc)
 {
        int ret = 0;
        if (wbc->for_reclaim)
-               return FLUSH_HIGHPRI | FLUSH_STABLE;
+               return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
        if (wbc->sync_mode == WB_SYNC_ALL)
                ret = FLUSH_COND_STABLE;
-       if (wbc->for_kupdate || wbc->for_background)
-               ret |= FLUSH_LOWPRI;
        return ret;
 }
 
@@ -545,12 +545,22 @@ try_again:
        return head;
 }
 
+/*
+ * Give up on a write request: unlock it, end page writeback, remove the
+ * page from its mapping, and finally drop the request reference.
+ *
+ * The reference is dropped last because generic_error_remove_page() is
+ * passed req->wb_page; releasing @req before that read may drop the final
+ * reference and leave the read touching freed memory.
+ */
+static void nfs_write_error_remove_page(struct nfs_page *req)
+{
+       nfs_unlock_request(req);
+       nfs_end_page_writeback(req);
+       generic_error_remove_page(page_file_mapping(req->wb_page),
+                                 req->wb_page);
+       nfs_release_request(req);
+}
+
 /*
  * Find an associated nfs write request, and prepare to flush it out
  * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-                               struct page *page, bool nonblock)
+                               struct page *page, bool nonblock,
+                               bool launder)
 {
        struct nfs_page *req;
        int ret = 0;
@@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 
        ret = 0;
        if (!nfs_pageio_add_request(pgio, req)) {
-               nfs_redirty_request(req);
                ret = pgio->pg_error;
+               /*
+                * Remove the problematic req upon fatal errors
+                * in launder case, while other dirty pages can
+                * still be around until they get flushed.
+                */
+               if (nfs_error_is_fatal(ret)) {
+                       nfs_context_set_write_error(req->wb_context, ret);
+                       if (launder) {
+                               nfs_write_error_remove_page(req);
+                               goto out;
+                       }
+               }
+               nfs_redirty_request(req);
+               ret = -EAGAIN;
        } else
                nfs_add_stats(page_file_mapping(page)->host,
                                NFSIOS_WRITEPAGES, 1);
@@ -576,12 +599,14 @@ out:
        return ret;
 }
 
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+                           struct nfs_pageio_descriptor *pgio, bool launder)
 {
        int ret;
 
        nfs_pageio_cond_complete(pgio, page_file_index(page));
-       ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+       ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
+                                  launder);
        if (ret == -EAGAIN) {
                redirty_page_for_writepage(wbc, page);
                ret = 0;
@@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 /*
  * Write an mmapped page to the server.
  */
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_writepage_locked(struct page *page,
+                               struct writeback_control *wbc,
+                               bool launder)
 {
        struct nfs_pageio_descriptor pgio;
        struct inode *inode = page_file_mapping(page)->host;
@@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
        nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
                                false, &nfs_async_write_completion_ops);
-       err = nfs_do_writepage(page, wbc, &pgio);
+       err = nfs_do_writepage(page, wbc, &pgio, launder);
        nfs_pageio_complete(&pgio);
        if (err < 0)
                return err;
@@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
 {
        int ret;
 
-       ret = nfs_writepage_locked(page, wbc);
+       ret = nfs_writepage_locked(page, wbc, false);
        unlock_page(page);
        return ret;
 }
@@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 {
        int ret;
 
-       ret = nfs_do_writepage(page, wbc, data);
+       ret = nfs_do_writepage(page, wbc, data, false);
        unlock_page(page);
        return ret;
 }
@@ -1128,7 +1155,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
                if (req == NULL)
                        return 0;
                l_ctx = req->wb_lock_context;
-               do_flush = req->wb_page != page || req->wb_context != ctx;
+               do_flush = req->wb_page != page ||
+                       !nfs_match_open_context(req->wb_context, ctx);
                /* for now, flush if more than 1 request in page_group */
                do_flush |= req->wb_this_page != req;
                if (l_ctx && flctx &&
@@ -1326,9 +1354,15 @@ static void nfs_async_write_error(struct list_head *head)
        }
 }
 
+/* reschedule_io hook: run the async write error cleanup over hdr->pages. */
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+       struct list_head *pages = &hdr->pages;
+
+       nfs_async_write_error(pages);
+}
+
 static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
        .error_cleanup = nfs_async_write_error,
        .completion = nfs_write_completion,
+       .reschedule_io = nfs_async_write_reschedule_io,
 };
 
 void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -1529,27 +1563,21 @@ static void nfs_writeback_result(struct rpc_task *task,
        }
 }
 
-
-static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
 {
-       int ret;
+       return wait_on_atomic_t(&cinfo->rpcs_out,
+                       nfs_wait_atomic_killable, TASK_KILLABLE);
+}
 
-       if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
-               return 1;
-       if (!may_wait)
-               return 0;
-       ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
-                               NFS_INO_COMMIT,
-                               nfs_wait_bit_killable,
-                               TASK_KILLABLE);
-       return (ret < 0) ? ret : 1;
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+       atomic_inc(&cinfo->rpcs_out);
 }
 
-static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
 {
-       clear_bit(NFS_INO_COMMIT, &nfsi->flags);
-       smp_mb__after_atomic();
-       wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+       if (atomic_dec_and_test(&cinfo->rpcs_out))
+               wake_up_atomic_t(&cinfo->rpcs_out);
 }
 
 void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1666,6 +1694,13 @@ void nfs_retry_commit(struct list_head *page_list,
 }
 EXPORT_SYMBOL_GPL(nfs_retry_commit);
 
+/* resched_write hook: mark the request's page dirty again. */
+static void
+nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+               struct nfs_page *req)
+{
+       struct page *page = req->wb_page;
+
+       /* @cinfo belongs to the hook signature; it is not used here */
+       __set_page_dirty_nobuffers(page);
+}
+
 /*
  * Commit dirty pages
  */
@@ -1687,7 +1722,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
                                   data->mds_ops, how, 0);
  out_bad:
        nfs_retry_commit(head, NULL, cinfo, 0);
-       cinfo->completion_ops->error_cleanup(NFS_I(inode));
        return -ENOMEM;
 }
 
@@ -1749,8 +1783,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
                clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 
        nfs_init_cinfo(&cinfo, data->inode, data->dreq);
-       if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
-               nfs_commit_clear_lock(NFS_I(data->inode));
+       nfs_commit_end(cinfo.mds);
 }
 
 static void nfs_commit_release(void *calldata)
@@ -1769,7 +1802,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
 
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
        .completion = nfs_commit_release_pages,
-       .error_cleanup = nfs_commit_clear_lock,
+       .resched_write = nfs_commit_resched_write,
 };
 
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
@@ -1788,30 +1821,25 @@ int nfs_commit_inode(struct inode *inode, int how)
        LIST_HEAD(head);
        struct nfs_commit_info cinfo;
        int may_wait = how & FLUSH_SYNC;
+       int error = 0;
        int res;
 
-       res = nfs_commit_set_lock(NFS_I(inode), may_wait);
-       if (res <= 0)
-               goto out_mark_dirty;
        nfs_init_cinfo_from_inode(&cinfo, inode);
+       nfs_commit_begin(cinfo.mds);
        res = nfs_scan_commit(inode, &head, &cinfo);
-       if (res) {
-               int error;
-
+       if (res)
                error = nfs_generic_commit_list(inode, &head, how, &cinfo);
-               if (error < 0)
-                       return error;
-               if (!may_wait)
-                       goto out_mark_dirty;
-               error = wait_on_bit_action(&NFS_I(inode)->flags,
-                               NFS_INO_COMMIT,
-                               nfs_wait_bit_killable,
-                               TASK_KILLABLE);
-               if (error < 0)
-                       return error;
-       } else
-               nfs_commit_clear_lock(NFS_I(inode));
+       nfs_commit_end(cinfo.mds);
+       if (error < 0)
+               goto out_error;
+       if (!may_wait)
+               goto out_mark_dirty;
+       error = wait_on_commit(cinfo.mds);
+       if (error < 0)
+               return error;
        return res;
+out_error:
+       res = error;
        /* Note: If we exit without ensuring that the commit is complete,
         * we must mark the inode as dirty. Otherwise, future calls to
         * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
@@ -1821,6 +1849,7 @@ out_mark_dirty:
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
        return res;
 }
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
 
 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
@@ -1911,7 +1940,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 /*
  * Write back all requests on one page - we do this before reading it.
  */
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
 {
        loff_t range_start = page_file_offset(page);
        loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
@@ -1928,7 +1957,7 @@ int nfs_wb_page(struct inode *inode, struct page *page)
        for (;;) {
                wait_on_page_writeback(page);
                if (clear_page_dirty_for_io(page)) {
-                       ret = nfs_writepage_locked(page, &wbc);
+                       ret = nfs_writepage_locked(page, &wbc, launder);
                        if (ret < 0)
                                goto out_error;
                        continue;
index 43aeabd..d6f9b4e 100644 (file)
@@ -592,4 +592,18 @@ enum data_content4 {
        NFS4_CONTENT_HOLE               = 1,
 };
 
+/*
+ * Reason codes for a pnfs layout update.  The values are written out
+ * explicitly; they match the implicit numbering of a plain enumerator list.
+ */
+enum pnfs_update_layout_reason {
+       PNFS_UPDATE_LAYOUT_UNKNOWN              = 0,
+       PNFS_UPDATE_LAYOUT_NO_PNFS              = 1,
+       PNFS_UPDATE_LAYOUT_RD_ZEROLEN           = 2,
+       PNFS_UPDATE_LAYOUT_MDSTHRESH            = 3,
+       PNFS_UPDATE_LAYOUT_NOMEM                = 4,
+       PNFS_UPDATE_LAYOUT_BULK_RECALL          = 5,
+       PNFS_UPDATE_LAYOUT_IO_TEST_FAIL         = 6,
+       PNFS_UPDATE_LAYOUT_FOUND_CACHED         = 7,
+       PNFS_UPDATE_LAYOUT_RETURN               = 8,
+       PNFS_UPDATE_LAYOUT_BLOCKED              = 9,
+       PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET       = 10,
+};
+
 #endif
index 37a3d29..48e0320 100644 (file)
@@ -60,18 +60,12 @@ struct nfs_lockowner {
        pid_t l_pid;
 };
 
-#define NFS_IO_INPROGRESS 0
-struct nfs_io_counter {
-       unsigned long flags;
-       atomic_t io_count;
-};
-
 struct nfs_lock_context {
        atomic_t count;
        struct list_head list;
        struct nfs_open_context *open_context;
        struct nfs_lockowner lockowner;
-       struct nfs_io_counter io_count;
+       atomic_t io_count;
 };
 
 struct nfs4_state;
@@ -216,7 +210,6 @@ struct nfs_inode {
 #define NFS_INO_FLUSHING       (4)             /* inode is flushing out data */
 #define NFS_INO_FSCACHE                (5)             /* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK   (6)             /* FS-Cache cookie management lock */
-#define NFS_INO_COMMIT         (7)             /* inode is committing unstable writes */
 #define NFS_INO_LAYOUTCOMMIT   (9)             /* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)          /* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS    (11)            /* layoutstats inflight */
@@ -518,12 +511,24 @@ extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
  */
 extern int nfs_sync_inode(struct inode *inode);
 extern int nfs_wb_all(struct inode *inode);
-extern int nfs_wb_page(struct inode *inode, struct page* page);
+extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 extern int  nfs_commit_inode(struct inode *, int);
 extern struct nfs_commit_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_commit_data *data);
 
+/* Write back a single page with the launder flag set. */
+static inline int nfs_wb_launder_page(struct inode *inode, struct page *page)
+{
+       const bool launder = true;
+
+       return nfs_wb_single_page(inode, page, launder);
+}
+
+/* Write back a single page with the launder flag clear. */
+static inline int nfs_wb_page(struct inode *inode, struct page *page)
+{
+       const bool launder = false;
+
+       return nfs_wb_single_page(inode, page, launder);
+}
+
 static inline int
 nfs_have_writebacks(struct inode *inode)
 {
index 2469ab0..7fcc13c 100644 (file)
@@ -102,6 +102,7 @@ struct nfs_client {
 #define NFS_SP4_MACH_CRED_STATEID  4   /* TEST_STATEID and FREE_STATEID */
 #define NFS_SP4_MACH_CRED_WRITE    5   /* WRITE */
 #define NFS_SP4_MACH_CRED_COMMIT   6   /* COMMIT */
+#define NFS_SP4_MACH_CRED_PNFS_CLEANUP  7 /* LAYOUTRETURN */
 #endif /* CONFIG_NFS_V4 */
 
        /* Our own IP address, as a null-terminated string.
index 11bbae4..791098a 100644 (file)
@@ -1375,6 +1375,7 @@ enum {
        NFS_IOHDR_ERROR = 0,
        NFS_IOHDR_EOF,
        NFS_IOHDR_REDO,
+       NFS_IOHDR_STAT,
 };
 
 struct nfs_pgio_header {
@@ -1420,11 +1421,12 @@ struct nfs_mds_commit_info {
        struct list_head        list;
 };
 
+struct nfs_commit_info;
 struct nfs_commit_data;
 struct nfs_inode;
 struct nfs_commit_completion_ops {
-       void (*error_cleanup) (struct nfs_inode *nfsi);
        void (*completion) (struct nfs_commit_data *data);
+       void (*resched_write) (struct nfs_commit_info *, struct nfs_page *);
 };
 
 struct nfs_commit_info {
@@ -1454,12 +1456,14 @@ struct nfs_commit_data {
        const struct rpc_call_ops *mds_ops;
        const struct nfs_commit_completion_ops *completion_ops;
        int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
+       unsigned long           flags;
 };
 
 struct nfs_pgio_completion_ops {
        void    (*error_cleanup)(struct list_head *head);
        void    (*init_hdr)(struct nfs_pgio_header *hdr);
        void    (*completion)(struct nfs_pgio_header *hdr);
+       void    (*reschedule_io)(struct nfs_pgio_header *hdr);
 };
 
 struct nfs_unlinkdata {
index 23608eb..b7f2104 100644 (file)
@@ -1217,6 +1217,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen)
                        return -EINVAL;
                memcpy(buf, &rpc_in6addr_loopback,
                                sizeof(rpc_in6addr_loopback));
+               break;
        default:
                dprintk("RPC:       %s: address family not supported\n",
                        __func__);
index 2dcb44f..cc1251d 100644 (file)
@@ -15,7 +15,7 @@
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-#define RPCRDMA_BACKCHANNEL_DEBUG
+#undef RPCRDMA_BACKCHANNEL_DEBUG
 
 static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
                                 struct rpc_rqst *rqst)
@@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
        size_t size;
 
        req = rpcrdma_create_req(r_xprt);
-       if (!req)
-               return -ENOMEM;
+       if (IS_ERR(req))
+               return PTR_ERR(req);
        req->rl_backchannel = true;
 
        size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
@@ -84,9 +84,7 @@ out_fail:
 static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
                                 unsigned int count)
 {
-       struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
        struct rpcrdma_rep *rep;
-       unsigned long flags;
        int rc = 0;
 
        while (count--) {
@@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
                        break;
                }
 
-               spin_lock_irqsave(&buffers->rb_lock, flags);
-               list_add(&rep->rr_list, &buffers->rb_recv_bufs);
-               spin_unlock_irqrestore(&buffers->rb_lock, flags);
+               rpcrdma_recv_buffer_put(rep);
        }
 
        return rc;
@@ -140,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
                               __func__);
                        goto out_free;
                }
+               dprintk("RPC:       %s: new rqst %p\n", __func__, rqst);
 
                rqst->rq_xprt = &r_xprt->rx_xprt;
                INIT_LIST_HEAD(&rqst->rq_list);
@@ -220,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 
        rpclen = rqst->rq_svec[0].iov_len;
 
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
        pr_info("RPC:       %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
                __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
        pr_info("RPC:       %s: RPC/RDMA: %*ph\n",
                __func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
        pr_info("RPC:       %s:      RPC: %*ph\n",
                __func__, (int)rpclen, rqst->rq_svec[0].iov_base);
+#endif
 
        req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
        req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
@@ -269,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 {
        struct rpc_xprt *xprt = rqst->rq_xprt;
 
+       dprintk("RPC:       %s: freeing rqst %p (req %p)\n",
+               __func__, rqst, rpcr_to_rdmar(rqst));
+
        smp_mb__before_atomic();
        WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
        clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
@@ -333,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
                                struct rpc_rqst, rq_bc_pa_list);
        list_del(&rqst->rq_bc_pa_list);
        spin_unlock(&xprt->bc_pa_lock);
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
-       pr_info("RPC:       %s: using rqst %p\n", __func__, rqst);
-#endif
+       dprintk("RPC:       %s: using rqst %p\n", __func__, rqst);
 
        /* Prepare rqst */
        rqst->rq_reply_bytes_recvd = 0;
@@ -355,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
         * direction reply.
         */
        req = rpcr_to_rdmar(rqst);
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
-       pr_info("RPC:       %s: attaching rep %p to req %p\n",
+       dprintk("RPC:       %s: attaching rep %p to req %p\n",
                __func__, rep, req);
-#endif
        req->rl_reply = rep;
 
        /* Defeat the retransmit detection logic in send_request */
index f1e8daf..c14f3a4 100644 (file)
@@ -179,6 +179,69 @@ out_maperr:
        return rc;
 }
 
+/*
+ * Detach the MW from @seg, DMA unmap seg->mr_nsegs consecutive segments
+ * starting at @seg, then give the MW back through rpcrdma_put_mw().
+ */
+static void
+__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+       struct ib_device *device = r_xprt->rx_ia.ri_device;
+       struct rpcrdma_mw *mw = seg->rl_mw;
+       int remaining;
+
+       seg->rl_mw = NULL;
+
+       for (remaining = seg->mr_nsegs; remaining; remaining--, seg++)
+               rpcrdma_unmap_one(device, seg);
+
+       rpcrdma_put_mw(r_xprt, mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ *
+ * Walks req->rl_segments one chunk at a time: the first segment of
+ * each chunk carries the MW, and its mr_nsegs is the stride to the
+ * next chunk.
+ */
+static void
+fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+       struct rpcrdma_mr_seg *seg;
+       unsigned int idx, left;
+       LIST_HEAD(unmap_list);
+       int rc;
+
+       dprintk("RPC:       %s: req %p\n", __func__, req);
+
+       /* ORDER: Invalidate all of the req's MRs first
+        *
+        * ib_unmap_fmr() is slow, so use a single call instead
+        * of one call per mapped MR.
+        */
+       idx = 0;
+       left = req->rl_nchunks;
+       while (left--) {
+               seg = &req->rl_segments[idx];
+
+               list_add(&seg->rl_mw->r.fmr.fmr->list, &unmap_list);
+
+               idx += seg->mr_nsegs;
+       }
+
+       rc = ib_unmap_fmr(&unmap_list);
+       if (rc)
+               pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+
+       /* ORDER: Now DMA unmap all of the req's MRs, and return
+        * them to the free MW list.
+        */
+       idx = 0;
+       left = req->rl_nchunks;
+       while (left--) {
+               seg = &req->rl_segments[idx];
+
+               __fmr_dma_unmap(r_xprt, seg);
+
+               /* advance before the stride is cleared */
+               idx += seg->mr_nsegs;
+               seg->mr_nsegs = 0;
+       }
+
+       req->rl_nchunks = 0;
+}
+
 /* Use the ib_unmap_fmr() verb to prevent further remote
  * access via RDMA READ or RDMA WRITE.
  */
@@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
 
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_map                         = fmr_op_map,
+       .ro_unmap_sync                  = fmr_op_unmap_sync,
        .ro_unmap                       = fmr_op_unmap,
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
index 88cf9e7..c683684 100644 (file)
@@ -245,12 +245,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
                     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
 }
 
-/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
+/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs
+ * to be reset.
+ *
+ * WARNING: Only wr_id and status are reliable at this point
+ */
 static void
-frwr_sendcompletion(struct ib_wc *wc)
+__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r)
 {
-       struct rpcrdma_mw *r;
-
        if (likely(wc->status == IB_WC_SUCCESS))
                return;
 
@@ -261,9 +263,23 @@ frwr_sendcompletion(struct ib_wc *wc)
        else
                pr_warn("RPC:       %s: frmr %p error, status %s (%d)\n",
                        __func__, r, ib_wc_status_msg(wc->status), wc->status);
+
        r->r.frmr.fr_state = FRMR_IS_STALE;
 }
 
+/*
+ * Send completion handler.  wc->wr_id holds the rpcrdma_mw pointer.
+ * An unsuccessful completion is handed to __frwr_sendcompletion_flush();
+ * afterwards fr_linv_done is completed if the frmr has fr_waiter set.
+ */
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+       struct rpcrdma_mw *mw = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS))
+               __frwr_sendcompletion_flush(wc, mw);
+
+       if (!mw->r.frmr.fr_waiter)
+               return;
+       complete(&mw->r.frmr.fr_linv_done);
+}
+
 static int
 frwr_op_init(struct rpcrdma_xprt *r_xprt)
 {
@@ -319,7 +335,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        struct rpcrdma_mw *mw;
        struct rpcrdma_frmr *frmr;
        struct ib_mr *mr;
-       struct ib_reg_wr reg_wr;
+       struct ib_reg_wr *reg_wr;
        struct ib_send_wr *bad_wr;
        int rc, i, n, dma_nents;
        u8 key;
@@ -335,7 +351,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        } while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
        frmr = &mw->r.frmr;
        frmr->fr_state = FRMR_IS_VALID;
+       frmr->fr_waiter = false;
        mr = frmr->fr_mr;
+       reg_wr = &frmr->fr_regwr;
 
        if (nsegs > ia->ri_max_frmr_depth)
                nsegs = ia->ri_max_frmr_depth;
@@ -381,19 +399,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        key = (u8)(mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(mr, ++key);
 
-       reg_wr.wr.next = NULL;
-       reg_wr.wr.opcode = IB_WR_REG_MR;
-       reg_wr.wr.wr_id = (uintptr_t)mw;
-       reg_wr.wr.num_sge = 0;
-       reg_wr.wr.send_flags = 0;
-       reg_wr.mr = mr;
-       reg_wr.key = mr->rkey;
-       reg_wr.access = writing ?
-                       IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
-                       IB_ACCESS_REMOTE_READ;
+       reg_wr->wr.next = NULL;
+       reg_wr->wr.opcode = IB_WR_REG_MR;
+       reg_wr->wr.wr_id = (uintptr_t)mw;
+       reg_wr->wr.num_sge = 0;
+       reg_wr->wr.send_flags = 0;
+       reg_wr->mr = mr;
+       reg_wr->key = mr->rkey;
+       reg_wr->access = writing ?
+                        IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+                        IB_ACCESS_REMOTE_READ;
 
        DECR_CQCOUNT(&r_xprt->rx_ep);
-       rc = ib_post_send(ia->ri_id->qp, &reg_wr.wr, &bad_wr);
+       rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
        if (rc)
                goto out_senderr;
 
@@ -413,6 +431,116 @@ out_senderr:
        return rc;
 }
 
+/*
+ * Set up the LOCAL_INV work request embedded in @seg's MW and return it.
+ * The frmr is marked FRMR_IS_INVALID with no waiter; the WR is zeroed and
+ * then given the MW pointer as wr_id and the MR's rkey to invalidate.
+ */
+static struct ib_send_wr *
+__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+{
+       struct rpcrdma_mw *mw = seg->rl_mw;
+       struct rpcrdma_frmr *frmr = &mw->r.frmr;
+       struct ib_send_wr *wr = &frmr->fr_invwr;
+
+       frmr->fr_waiter = false;
+       frmr->fr_state = FRMR_IS_INVALID;
+
+       memset(wr, 0, sizeof(*wr));
+       wr->wr_id = (unsigned long)(void *)mw;
+       wr->opcode = IB_WR_LOCAL_INV;
+       wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
+
+       return wr;
+}
+
+/*
+ * Detach the MW from @seg and DMA unmap its scatterlist.  A non-zero @rc
+ * sends the MW to __frwr_queue_recovery(); otherwise it is handed back
+ * through rpcrdma_put_mw().
+ */
+static void
+__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+                int rc)
+{
+       struct ib_device *device = r_xprt->rx_ia.ri_device;
+       struct rpcrdma_mw *mw = seg->rl_mw;
+       struct rpcrdma_frmr *frmr = &mw->r.frmr;
+
+       seg->rl_mw = NULL;
+
+       ib_dma_unmap_sg(device, frmr->sg, frmr->sg_nents, seg->mr_dir);
+
+       if (rc) {
+               __frwr_queue_recovery(mw);
+               return;
+       }
+       rpcrdma_put_mw(r_xprt, mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ *
+ * Returns immediately when "req" has no registered chunks. If the
+ * LOCAL_INV chain cannot be posted, no wait is performed: the WR that
+ * would signal fr_linv_done was not accepted, so no completion for it
+ * can arrive. The non-zero rc is passed to __frwr_dma_unmap(), which
+ * then queues each MW for recovery.
+ */
+static void
+frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+       struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_mr_seg *seg;
+       unsigned int i, nchunks;
+       struct rpcrdma_frmr *f;
+       int rc;
+
+       dprintk("RPC:       %s: req %p\n", __func__, req);
+
+       /* Without chunks there is no last MR to signal or wait on */
+       if (!req->rl_nchunks)
+               return;
+
+       /* ORDER: Invalidate all of the req's MRs first
+        *
+        * Chain the LOCAL_INV Work Requests and post them with
+        * a single ib_post_send() call.
+        */
+       invalidate_wrs = pos = prev = NULL;
+       seg = NULL;
+       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+               seg = &req->rl_segments[i];
+
+               pos = __frwr_prepare_linv_wr(seg);
+
+               if (!invalidate_wrs)
+                       invalidate_wrs = pos;
+               else
+                       prev->next = pos;
+               prev = pos;
+
+               i += seg->mr_nsegs;
+       }
+       f = &seg->rl_mw->r.frmr;
+
+       /* Strong send queue ordering guarantees that when the
+        * last WR in the chain completes, all WRs in the chain
+        * are complete.
+        */
+       f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+       f->fr_waiter = true;
+       init_completion(&f->fr_linv_done);
+       INIT_CQCOUNT(&r_xprt->rx_ep);
+
+       /* Transport disconnect drains the receive CQ before it
+        * replaces the QP. The RPC reply handler won't call us
+        * unless ri_id->qp is a valid pointer.
+        */
+       rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+       if (rc)
+               pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+       else
+               wait_for_completion(&f->fr_linv_done);
+
+       /* ORDER: Now DMA unmap all of the req's MRs, and return
+        * them to the free MW list.
+        */
+       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+               seg = &req->rl_segments[i];
+
+               __frwr_dma_unmap(r_xprt, seg, rc);
+
+               i += seg->mr_nsegs;
+               seg->mr_nsegs = 0;
+       }
+
+       req->rl_nchunks = 0;
+}
+
 /* Post a LOCAL_INV Work Request to prevent further remote access
  * via RDMA READ or RDMA WRITE.
  */
@@ -423,23 +551,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_mw *mw = seg1->rl_mw;
        struct rpcrdma_frmr *frmr = &mw->r.frmr;
-       struct ib_send_wr invalidate_wr, *bad_wr;
+       struct ib_send_wr *invalidate_wr, *bad_wr;
        int rc, nsegs = seg->mr_nsegs;
 
        dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
 
        seg1->rl_mw = NULL;
        frmr->fr_state = FRMR_IS_INVALID;
+       invalidate_wr = &mw->r.frmr.fr_invwr;
 
-       memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-       invalidate_wr.wr_id = (unsigned long)(void *)mw;
-       invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
+       memset(invalidate_wr, 0, sizeof(*invalidate_wr));
+       invalidate_wr->wr_id = (uintptr_t)mw;
+       invalidate_wr->opcode = IB_WR_LOCAL_INV;
+       invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
        ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
        read_lock(&ia->ri_qplock);
-       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+       rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
        read_unlock(&ia->ri_qplock);
        if (rc)
                goto out_err;
@@ -471,6 +600,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
 
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_map                         = frwr_op_map,
+       .ro_unmap_sync                  = frwr_op_unmap_sync,
        .ro_unmap                       = frwr_op_unmap,
        .ro_open                        = frwr_op_open,
        .ro_maxpages                    = frwr_op_maxpages,
index 617b76f..dbb302e 100644 (file)
@@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
        return 1;
 }
 
+/* DMA unmap all memory regions that were mapped for "req": one
+ * rl_segments entry per chunk, leaving req->rl_nchunks at zero. */
+static void
+physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+       struct ib_device *device = r_xprt->rx_ia.ri_device;
+       unsigned int i;         /* index of the next segment to unmap */
+
+       for (i = 0; req->rl_nchunks; --req->rl_nchunks)
+               rpcrdma_unmap_one(device, &req->rl_segments[i++]);
+}
+
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
 
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
        .ro_map                         = physical_op_map,
+       .ro_unmap_sync                  = physical_op_unmap_sync,
        .ro_unmap                       = physical_op_unmap,
        .ro_open                        = physical_op_open,
        .ro_maxpages                    = physical_op_maxpages,
index c10d969..0f28f2d 100644 (file)
@@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
        if (req->rl_reply)
                goto out_duplicate;
 
+       /* Sanity checking has passed. We are now committed
+        * to complete this transaction.
+        */
+       list_del_init(&rqst->rq_list);
+       spin_unlock_bh(&xprt->transport_lock);
        dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
                "                   RPC request 0x%p xid 0x%08x\n",
                        __func__, rep, req, rqst,
@@ -888,12 +893,23 @@ badheader:
                break;
        }
 
+       /* Invalidate and flush the data payloads before waking the
+        * waiting application. This guarantees the memory region is
+        * properly fenced from the server before the application
+        * accesses the data. It also ensures proper send flow
+        * control: waking the next RPC waits until this RPC has
+        * relinquished all its Send Queue entries.
+        */
+       if (req->rl_nchunks)
+               r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
+
        credits = be32_to_cpu(headerp->rm_credit);
        if (credits == 0)
                credits = 1;    /* don't deadlock */
        else if (credits > r_xprt->rx_buf.rb_max_requests)
                credits = r_xprt->rx_buf.rb_max_requests;
 
+       spin_lock_bh(&xprt->transport_lock);
        cwnd = xprt->cwnd;
        xprt->cwnd = credits << RPC_CWNDSHIFT;
        if (xprt->cwnd > cwnd)
index 8c545f7..740bddc 100644 (file)
@@ -576,6 +576,9 @@ xprt_rdma_free(void *buffer)
 
        rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
        req = rb->rg_owner;
+       if (req->rl_backchannel)
+               return;
+
        r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
 
        dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
index eadd165..732c71c 100644 (file)
@@ -616,10 +616,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 
        /* set trigger for requesting send completion */
        ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
-       if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
-               ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
-       else if (ep->rep_cqinit <= 2)
-               ep->rep_cqinit = 0;
+       if (ep->rep_cqinit <= 2)
+               ep->rep_cqinit = 0;     /* always signal? */
        INIT_CQCOUNT(ep);
        init_waitqueue_head(&ep->rep_connect_wait);
        INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
@@ -852,10 +850,11 @@ retry:
 
                if (extras) {
                        rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
-                       if (rc)
+                       if (rc) {
                                pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
                                        __func__, rc);
                                rc = 0;
+                       }
                }
        }
 
@@ -1337,15 +1336,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_ep *ep = &r_xprt->rx_ep;
        struct rpcrdma_rep *rep;
-       unsigned long flags;
        int rc;
 
        while (count--) {
-               spin_lock_irqsave(&buffers->rb_lock, flags);
+               spin_lock(&buffers->rb_lock);
                if (list_empty(&buffers->rb_recv_bufs))
                        goto out_reqbuf;
                rep = rpcrdma_buffer_get_rep_locked(buffers);
-               spin_unlock_irqrestore(&buffers->rb_lock, flags);
+               spin_unlock(&buffers->rb_lock);
 
                rc = rpcrdma_ep_post_recv(ia, ep, rep);
                if (rc)
@@ -1355,7 +1353,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
        return 0;
 
 out_reqbuf:
-       spin_unlock_irqrestore(&buffers->rb_lock, flags);
+       spin_unlock(&buffers->rb_lock);
        pr_warn("%s: no extra receive buffers\n", __func__);
        return -ENOMEM;
 
index ac7f8d4..728101d 100644 (file)
@@ -88,12 +88,6 @@ struct rpcrdma_ep {
        struct delayed_work     rep_connect_worker;
 };
 
-/*
- * Force a signaled SEND Work Request every so often,
- * in case the provider needs to do some housekeeping.
- */
-#define RPCRDMA_MAX_UNSIGNALED_SENDS   (32)
-
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
@@ -207,6 +201,12 @@ struct rpcrdma_frmr {
        enum rpcrdma_frmr_state         fr_state;
        struct work_struct              fr_work;
        struct rpcrdma_xprt             *fr_xprt;
+       bool                            fr_waiter;
+       struct completion               fr_linv_done;
+       union {
+               struct ib_reg_wr        fr_regwr;
+               struct ib_send_wr       fr_invwr;
+       };
 };
 
 struct rpcrdma_fmr {
@@ -364,6 +364,8 @@ struct rpcrdma_xprt;
 struct rpcrdma_memreg_ops {
        int             (*ro_map)(struct rpcrdma_xprt *,
                                  struct rpcrdma_mr_seg *, int, bool);
+       void            (*ro_unmap_sync)(struct rpcrdma_xprt *,
+                                        struct rpcrdma_req *);
        int             (*ro_unmap)(struct rpcrdma_xprt *,
                                    struct rpcrdma_mr_seg *);
        int             (*ro_open)(struct rpcrdma_ia *,
index 2ffaf6a..fde2138 100644 (file)
@@ -398,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
        if (unlikely(!sock))
                return -ENOTSOCK;
 
-       clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags);
        if (base != 0) {
                addr = NULL;
                addrlen = 0;
@@ -442,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task)
        struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
 
        transport->inet->sk_write_pending--;
-       clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
 }
 
 /**
@@ -467,20 +465,11 @@ static int xs_nospace(struct rpc_task *task)
 
        /* Don't race with disconnect */
        if (xprt_connected(xprt)) {
-               if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) {
-                       /*
-                        * Notify TCP that we're limited by the application
-                        * window size
-                        */
-                       set_bit(SOCK_NOSPACE, &transport->sock->flags);
-                       sk->sk_write_pending++;
-                       /* ...and wait for more buffer space */
-                       xprt_wait_for_buffer_space(task, xs_nospace_callback);
-               }
-       } else {
-               clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
+               /* wait for more buffer space */
+               sk->sk_write_pending++;
+               xprt_wait_for_buffer_space(task, xs_nospace_callback);
+       } else
                ret = -ENOTCONN;
-       }
 
        spin_unlock_bh(&xprt->transport_lock);
 
@@ -616,9 +605,6 @@ process_status:
        case -EAGAIN:
                status = xs_nospace(task);
                break;
-       default:
-               dprintk("RPC:       sendmsg returned unrecognized error %d\n",
-                       -status);
        case -ENETUNREACH:
        case -ENOBUFS:
        case -EPIPE:
@@ -626,7 +612,10 @@ process_status:
        case -EPERM:
                /* When the server has died, an ICMP port unreachable message
                 * prompts ECONNREFUSED. */
-               clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
+               break;
+       default:
+               dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+                       -status);
        }
 
        return status;
@@ -706,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task)
        case -EAGAIN:
                status = xs_nospace(task);
                break;
-       default:
-               dprintk("RPC:       sendmsg returned unrecognized error %d\n",
-                       -status);
        case -ECONNRESET:
        case -ECONNREFUSED:
        case -ENOTCONN:
        case -EADDRINUSE:
        case -ENOBUFS:
        case -EPIPE:
-               clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
+               break;
+       default:
+               dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+                       -status);
        }
 
        return status;
@@ -1609,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk)
 
 static void xs_write_space(struct sock *sk)
 {
-       struct socket *sock;
+       struct socket_wq *wq;
        struct rpc_xprt *xprt;
 
-       if (unlikely(!(sock = sk->sk_socket)))
+       if (!sk->sk_socket)
                return;
-       clear_bit(SOCK_NOSPACE, &sock->flags);
+       clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 
        if (unlikely(!(xprt = xprt_from_sock(sk))))
                return;
-       if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0)
-               return;
+       rcu_read_lock();
+       wq = rcu_dereference(sk->sk_wq);
+       if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
+               goto out;
 
        xprt_write_space(xprt);
+out:
+       rcu_read_unlock();
 }
 
 /**
@@ -1907,18 +1900,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
        }
 }
 #else
-static inline void xs_reclassify_socketu(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket4(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket6(struct socket *sock)
-{
-}
-
 static inline void xs_reclassify_socket(int family, struct socket *sock)
 {
 }
@@ -2008,7 +1989,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
                        "transport socket (%d).\n", -status);
                goto out;
        }
-       xs_reclassify_socketu(sock);
+       xs_reclassify_socket(AF_LOCAL, sock);
 
        dprintk("RPC:       worker connecting xprt %p via AF_LOCAL to %s\n",
                        xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);