ceph: send TID of the oldest pending caps flush to MDS
author: Yan, Zheng <zyan@redhat.com>
Wed, 10 Jun 2015 03:09:32 +0000 (11:09 +0800)
committer: Ilya Dryomov <idryomov@gmail.com>
Thu, 25 Jun 2015 08:49:31 +0000 (11:49 +0300)
Using this information, the MDS can trim its list of completed cap
flushes (which is used to detect duplicated cap flushes).

Signed-off-by: Yan, Zheng <zyan@redhat.com>
fs/ceph/caps.c

index 0295048..4202727 100644 (file)
@@ -986,8 +986,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 static int send_cap_msg(struct ceph_mds_session *session,
                        u64 ino, u64 cid, int op,
                        int caps, int wanted, int dirty,
-                       u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
-                       u64 size, u64 max_size,
+                       u32 seq, u64 flush_tid, u64 oldest_flush_tid,
+                       u32 issue_seq, u32 mseq, u64 size, u64 max_size,
                        struct timespec *mtime, struct timespec *atime,
                        u64 time_warp_seq,
                        kuid_t uid, kgid_t gid, umode_t mode,
@@ -1001,20 +1001,23 @@ static int send_cap_msg(struct ceph_mds_session *session,
        size_t extra_len;
 
        dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
-            " seq %u/%u mseq %u follows %lld size %llu/%llu"
+            " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
             " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
             cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
             ceph_cap_string(dirty),
-            seq, issue_seq, mseq, follows, size, max_size,
+            seq, issue_seq, flush_tid, oldest_flush_tid,
+            mseq, follows, size, max_size,
             xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-       /* flock buffer size + inline version + inline data size */
-       extra_len = 4 + 8 + 4;
+       /* flock buffer size + inline version + inline data size +
+        * osd_epoch_barrier + oldest_flush_tid */
+       extra_len = 4 + 8 + 4 + 4 + 8;
        msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
                           GFP_NOFS, false);
        if (!msg)
                return -ENOMEM;
 
+       msg->hdr.version = cpu_to_le16(6);
        msg->hdr.tid = cpu_to_le64(flush_tid);
 
        fc = msg->front.iov_base;
@@ -1050,6 +1053,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
        ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
        /* inline data size */
        ceph_encode_32(&p, 0);
+       /* osd_epoch_barrier */
+       ceph_encode_32(&p, 0);
+       /* oldest_flush_tid */
+       ceph_encode_64(&p, oldest_flush_tid);
 
        fc->xattr_version = cpu_to_le64(xattr_version);
        if (xattrs_buf) {
@@ -1098,7 +1105,7 @@ void ceph_queue_caps_release(struct inode *inode)
  */
 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                      int op, int used, int want, int retain, int flushing,
-                     u64 flush_tid)
+                     u64 flush_tid, u64 oldest_flush_tid)
        __releases(cap->ci->i_ceph_lock)
 {
        struct ceph_inode_info *ci = cap->ci;
@@ -1187,7 +1194,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        spin_unlock(&ci->i_ceph_lock);
 
        ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
-               op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+               op, keep, want, flushing, seq,
+               flush_tid, oldest_flush_tid, issue_seq, mseq,
                size, max_size, &mtime, &atime, time_warp_seq,
                uid, gid, mode, xattr_version, xattr_blob,
                follows, inline_data);
@@ -1307,8 +1315,8 @@ retry:
                     inode, capsnap, capsnap->follows, capsnap->flush_tid);
                send_cap_msg(session, ceph_vino(inode).ino, 0,
                             CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
-                            capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
-                            capsnap->size, 0,
+                            capsnap->dirty, 0, capsnap->flush_tid, 0,
+                            0, mseq, capsnap->size, 0,
                             &capsnap->mtime, &capsnap->atime,
                             capsnap->time_warp_seq,
                             capsnap->uid, capsnap->gid, capsnap->mode,
@@ -1438,6 +1446,17 @@ static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
        rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
 }
 
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{      /* tid of the first entry in mdsc->cap_flush_tree, or 0 if none */
+       struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
+       if (n) {        /* NOTE(review): presumably tree is keyed by tid, so first == oldest; confirm */
+               struct ceph_cap_flush *cf =
+                       rb_entry(n, struct ceph_cap_flush, g_node);
+               return cf->tid;
+       }
+       return 0;       /* cap_flush_tree is empty */
+}
+
 /*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
@@ -1446,7 +1465,7 @@ static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
  */
 static int __mark_caps_flushing(struct inode *inode,
                                struct ceph_mds_session *session,
-                               u64 *flush_tid)
+                               u64 *flush_tid, u64 *oldest_flush_tid)
 {
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1473,6 +1492,7 @@ static int __mark_caps_flushing(struct inode *inode,
 
        cf->tid = ++mdsc->last_cap_flush_tid;
        __add_cap_flushing_to_mdsc(mdsc, cf);
+       *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
 
        if (list_empty(&ci->i_flushing_item)) {
                list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
@@ -1533,7 +1553,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap *cap;
-       u64 flush_tid;
+       u64 flush_tid, oldest_flush_tid;
        int file_wanted, used, cap_used;
        int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
        int issued, implemented, want, retain, revoking, flushing = 0;
@@ -1754,10 +1774,14 @@ ack:
 
                if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
                        flushing = __mark_caps_flushing(inode, session,
-                                                       &flush_tid);
+                                                       &flush_tid,
+                                                       &oldest_flush_tid);
                } else {
                        flushing = 0;
                        flush_tid = 0;
+                       spin_lock(&mdsc->cap_dirty_lock);
+                       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+                       spin_unlock(&mdsc->cap_dirty_lock);
                }
 
                mds = cap->mds;  /* remember mds, so we don't repeat */
@@ -1765,7 +1789,8 @@ ack:
 
                /* __send_cap drops i_ceph_lock */
                delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
-                                     want, retain, flushing, flush_tid);
+                                     want, retain, flushing,
+                                     flush_tid, oldest_flush_tid);
                goto retry; /* retake i_ceph_lock and restart our cap scan. */
        }
 
@@ -1800,7 +1825,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_session *session = NULL;
        int flushing = 0;
-       u64 flush_tid = 0;
+       u64 flush_tid = 0, oldest_flush_tid = 0;
 
 retry:
        spin_lock(&ci->i_ceph_lock);
@@ -1825,12 +1850,13 @@ retry:
                if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
                        goto out;
 
-               flushing = __mark_caps_flushing(inode, session, &flush_tid);
+               flushing = __mark_caps_flushing(inode, session, &flush_tid,
+                                               &oldest_flush_tid);
 
                /* __send_cap drops i_ceph_lock */
                delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
                                     (cap->issued | cap->implemented),
-                                    flushing, flush_tid);
+                                    flushing, flush_tid, oldest_flush_tid);
 
                if (delayed) {
                        spin_lock(&ci->i_ceph_lock);
@@ -2083,6 +2109,11 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
        struct rb_node *n;
        int delayed = 0;
        u64 first_tid = 0;
+       u64 oldest_flush_tid;
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+       spin_unlock(&mdsc->cap_dirty_lock);
 
        while (true) {
                spin_lock(&ci->i_ceph_lock);
@@ -2113,7 +2144,7 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
                                      __ceph_caps_used(ci),
                                      __ceph_caps_wanted(ci),
                                      cap->issued | cap->implemented,
-                                     cf->caps, cf->tid);
+                                     cf->caps, cf->tid, oldest_flush_tid);
        }
        return delayed;
 }