Merge tag 'v3.18-rc1' into x86/urgent
[cascardo/linux.git] / fs / ceph / mds_client.c
index bad07c0..a92d3f5 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/utsname.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -334,7 +335,7 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
 /*
  * sessions
  */
-static const char *session_state_name(int s)
+const char *ceph_session_state_name(int s)
 {
        switch (s) {
        case CEPH_MDS_SESSION_NEW: return "new";
@@ -542,6 +543,8 @@ void ceph_mdsc_release_request(struct kref *kref)
        }
        kfree(req->r_path1);
        kfree(req->r_path2);
+       if (req->r_pagelist)
+               ceph_pagelist_release(req->r_pagelist);
        put_request_session(req);
        ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
        kfree(req);
@@ -812,6 +815,74 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
        h = msg->front.iov_base;
        h->op = cpu_to_le32(op);
        h->seq = cpu_to_le64(seq);
+
+       return msg;
+}
+
+/*
+ * Build a CEPH_SESSION_REQUEST_OPEN message: session head followed by a
+ * map<string, string> of client metadata.  Returns NULL on allocation failure.
+ */
+static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+{
+       struct ceph_msg *msg;
+       struct ceph_mds_session_head *h;
+       int i;
+       int metadata_bytes;
+       int metadata_key_count = 0;
+       struct ceph_options *opt = mdsc->fsc->client->options;
+       void *p;
+
+       const char *metadata[3][2] = {
+               {"hostname", utsname()->nodename},
+               {"entity_id", opt->name ? opt->name : ""},
+               {NULL, NULL}
+       };
+
+       /* 4-byte entry count, then two 4-byte length-prefixed strings each */
+       metadata_bytes = 4;
+       for (i = 0; metadata[i][0] != NULL; ++i) {
+               metadata_bytes += 8 + strlen(metadata[i][0]) +
+                       strlen(metadata[i][1]);
+               metadata_key_count++;
+       }
+
+       /* Allocate the message */
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes,
+                          GFP_NOFS, false);
+       if (!msg) {
+               pr_err("create_session_open_msg ENOMEM creating msg\n");
+               return NULL;
+       }
+       h = msg->front.iov_base;
+       h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+       h->seq = cpu_to_le64(seq);
+
+       /* ClientSession messages with metadata are v2; field is little-endian */
+       msg->hdr.version = cpu_to_le16(2);
+
+       /*
+        * Serialize client metadata right after the session head, using
+        * the format that userspace expects for map<string, string>
+        */
+       p = msg->front.iov_base + sizeof(*h);
+
+       /* Number of entries in the map */
+       ceph_encode_32(&p, metadata_key_count);
+
+       /* Two length-prefixed strings for each entry in the map */
+       for (i = 0; metadata[i][0] != NULL; ++i) {
+               size_t const key_len = strlen(metadata[i][0]);
+               size_t const val_len = strlen(metadata[i][1]);
+
+               ceph_encode_32(&p, key_len);
+               memcpy(p, metadata[i][0], key_len);
+               p += key_len;
+               ceph_encode_32(&p, val_len);
+               memcpy(p, metadata[i][1], val_len);
+               p += val_len;
+       }
+
        return msg;
 }
 
@@ -835,7 +906,7 @@ static int __open_session(struct ceph_mds_client *mdsc,
        session->s_renew_requested = jiffies;
 
        /* send connect message */
-       msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
+       msg = create_session_open_msg(mdsc, session->s_seq);
        if (!msg)
                return -ENOMEM;
        ceph_con_send(&session->s_con, msg);
@@ -1164,7 +1235,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
 
        dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
-            session->s_mds, session_state_name(session->s_state), seq);
+            session->s_mds, ceph_session_state_name(session->s_state), seq);
        msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
        if (!msg)
                return -ENOMEM;
@@ -1216,7 +1287,7 @@ static int request_close_session(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
 
        dout("request_close_session mds%d state %s seq %lld\n",
-            session->s_mds, session_state_name(session->s_state),
+            session->s_mds, ceph_session_state_name(session->s_state),
             session->s_seq);
        msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
        if (!msg)
@@ -1847,13 +1918,15 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        msg->front.iov_len = p - msg->front.iov_base;
        msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 
-       if (req->r_data_len) {
-               /* outbound data set only by ceph_sync_setxattr() */
-               BUG_ON(!req->r_pages);
-               ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
+       if (req->r_pagelist) {
+               struct ceph_pagelist *pagelist = req->r_pagelist;
+               atomic_inc(&pagelist->refcnt);
+               ceph_msg_data_add_pagelist(msg, pagelist);
+               msg->hdr.data_len = cpu_to_le32(pagelist->length);
+       } else {
+               msg->hdr.data_len = 0;
        }
 
-       msg->hdr.data_len = cpu_to_le32(req->r_data_len);
        msg->hdr.data_off = cpu_to_le16(0);
 
 out_free2:
@@ -2007,7 +2080,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
        req->r_session = get_session(session);
 
        dout("do_request mds%d session %p state %s\n", mds, session,
-            session_state_name(session->s_state));
+            ceph_session_state_name(session->s_state));
        if (session->s_state != CEPH_MDS_SESSION_OPEN &&
            session->s_state != CEPH_MDS_SESSION_HUNG) {
                if (session->s_state == CEPH_MDS_SESSION_NEW ||
@@ -2078,6 +2151,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
                if (req->r_session &&
                    req->r_session->s_mds == mds) {
                        dout(" kicking tid %llu\n", req->r_tid);
+                       list_del_init(&req->r_wait);
                        __do_request(mdsc, req);
                }
        }
@@ -2444,7 +2518,7 @@ static void handle_session(struct ceph_mds_session *session,
 
        dout("handle_session mds%d %s %p state %s seq %llu\n",
             mds, ceph_session_op_name(op), session,
-            session_state_name(session->s_state), seq);
+            ceph_session_state_name(session->s_state), seq);
 
        if (session->s_state == CEPH_MDS_SESSION_HUNG) {
                session->s_state = CEPH_MDS_SESSION_OPEN;
@@ -2471,9 +2545,8 @@ static void handle_session(struct ceph_mds_session *session,
                if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
                        pr_info("mds%d reconnect denied\n", session->s_mds);
                remove_session_caps(session);
-               wake = 1; /* for good measure */
+               wake = 2; /* for good measure */
                wake_up_all(&mdsc->session_close_wq);
-               kick_requests(mdsc, mds);
                break;
 
        case CEPH_SESSION_STALE:
@@ -2503,6 +2576,8 @@ static void handle_session(struct ceph_mds_session *session,
        if (wake) {
                mutex_lock(&mdsc->mutex);
                __wake_requests(mdsc, &session->s_waiting);
+               if (wake == 2)
+                       kick_requests(mdsc, mds);
                mutex_unlock(&mdsc->mutex);
        }
        return;
@@ -2695,18 +2770,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
        session->s_state = CEPH_MDS_SESSION_RECONNECTING;
        session->s_seq = 0;
 
-       ceph_con_close(&session->s_con);
-       ceph_con_open(&session->s_con,
-                     CEPH_ENTITY_TYPE_MDS, mds,
-                     ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
-       /* replay unsafe requests */
-       replay_unsafe_requests(mdsc, session);
-
-       down_read(&mdsc->snap_rwsem);
-
        dout("session %p state %s\n", session,
-            session_state_name(session->s_state));
+            ceph_session_state_name(session->s_state));
 
        spin_lock(&session->s_gen_ttl_lock);
        session->s_cap_gen++;
@@ -2723,6 +2788,19 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
        discard_cap_releases(mdsc, session);
        spin_unlock(&session->s_cap_lock);
 
+       /* trim unused caps to reduce MDS's cache rejoin time */
+       shrink_dcache_parent(mdsc->fsc->sb->s_root);
+
+       ceph_con_close(&session->s_con);
+       ceph_con_open(&session->s_con,
+                     CEPH_ENTITY_TYPE_MDS, mds,
+                     ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+       /* replay unsafe requests */
+       replay_unsafe_requests(mdsc, session);
+
+       down_read(&mdsc->snap_rwsem);
+
        /* traverse this session's caps */
        s_nr_caps = session->s_nr_caps;
        err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
@@ -2791,7 +2869,6 @@ fail:
        mutex_unlock(&session->s_mutex);
 fail_nomsg:
        ceph_pagelist_release(pagelist);
-       kfree(pagelist);
 fail_nopagelist:
        pr_err("error %d preparing reconnect for mds%d\n", err, mds);
        return;
@@ -2827,7 +2904,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
                     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
                     ceph_mds_state_name(newstate),
                     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
-                    session_state_name(s->s_state));
+                    ceph_session_state_name(s->s_state));
 
                if (i >= newmap->m_max_mds ||
                    memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2939,14 +3016,15 @@ static void handle_lease(struct ceph_mds_client *mdsc,
        if (dname.len != get_unaligned_le32(h+1))
                goto bad;
 
-       mutex_lock(&session->s_mutex);
-       session->s_seq++;
-
        /* lookup inode */
        inode = ceph_find_inode(sb, vino);
        dout("handle_lease %s, ino %llx %p %.*s\n",
             ceph_lease_op_name(h->action), vino.ino, inode,
             dname.len, dname.name);
+
+       mutex_lock(&session->s_mutex);
+       session->s_seq++;
+
        if (inode == NULL) {
                dout("handle_lease no inode %llx\n", vino.ino);
                goto release;