drivers/nvme/host/rdma.c
1 /*
2  * NVMe over Fabrics RDMA host code.
3  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/delay.h>
16 #include <linux/module.h>
17 #include <linux/init.h>
18 #include <linux/slab.h>
19 #include <linux/err.h>
20 #include <linux/string.h>
21 #include <linux/jiffies.h>
22 #include <linux/atomic.h>
23 #include <linux/blk-mq.h>
24 #include <linux/types.h>
25 #include <linux/list.h>
26 #include <linux/mutex.h>
27 #include <linux/scatterlist.h>
28 #include <linux/nvme.h>
29 #include <linux/t10-pi.h>
30 #include <asm/unaligned.h>
31
32 #include <rdma/ib_verbs.h>
33 #include <rdma/rdma_cm.h>
34 #include <rdma/ib_cm.h>
35 #include <linux/nvme-rdma.h>
36
37 #include "nvme.h"
38 #include "fabrics.h"
39
40
41 #define NVME_RDMA_CONNECT_TIMEOUT_MS    1000            /* 1 second */
42
43 #define NVME_RDMA_MAX_SEGMENT_SIZE      0xffffff        /* 24-bit SGL field */
44
45 #define NVME_RDMA_MAX_SEGMENTS          256
46
47 #define NVME_RDMA_MAX_INLINE_SEGMENTS   1
48
49 #define NVME_RDMA_MAX_PAGES_PER_MR      512
50
51 #define NVME_RDMA_DEF_RECONNECT_DELAY   20
52
53 /*
54  * We handle AEN commands ourselves and don't even let the
55  * block layer know about them.
56  */
57 #define NVME_RDMA_NR_AEN_COMMANDS      1
58 #define NVME_RDMA_AQ_BLKMQ_DEPTH       \
59         (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
60
61 struct nvme_rdma_device {
62         struct ib_device       *dev;
63         struct ib_pd           *pd;
64         struct ib_mr           *mr;
65         struct kref             ref;
66         struct list_head        entry;
67 };
68
69 struct nvme_rdma_qe {
70         struct ib_cqe           cqe;
71         void                    *data;
72         u64                     dma;
73 };
74
75 struct nvme_rdma_queue;
76 struct nvme_rdma_request {
77         struct ib_mr            *mr;
78         struct nvme_rdma_qe     sqe;
79         struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
80         u32                     num_sge;
81         int                     nents;
82         bool                    inline_data;
83         bool                    need_inval;
84         struct ib_reg_wr        reg_wr;
85         struct ib_cqe           reg_cqe;
86         struct nvme_rdma_queue  *queue;
87         struct sg_table         sg_table;
88         struct scatterlist      first_sgl[];
89 };
90
91 enum nvme_rdma_queue_flags {
92         NVME_RDMA_Q_CONNECTED = (1 << 0),
93 };
94
95 struct nvme_rdma_queue {
96         struct nvme_rdma_qe     *rsp_ring;
97         u8                      sig_count;
98         int                     queue_size;
99         size_t                  cmnd_capsule_len;
100         struct nvme_rdma_ctrl   *ctrl;
101         struct nvme_rdma_device *device;
102         struct ib_cq            *ib_cq;
103         struct ib_qp            *qp;
104
105         unsigned long           flags;
106         struct rdma_cm_id       *cm_id;
107         int                     cm_error;
108         struct completion       cm_done;
109 };
110
111 struct nvme_rdma_ctrl {
112         /* read and written in the hot path */
113         spinlock_t              lock;
114
115         /* read only in the hot path */
116         struct nvme_rdma_queue  *queues;
117         u32                     queue_count;
118
119         /* other member variables */
120         struct blk_mq_tag_set   tag_set;
121         struct work_struct      delete_work;
122         struct work_struct      reset_work;
123         struct work_struct      err_work;
124
125         struct nvme_rdma_qe     async_event_sqe;
126
127         int                     reconnect_delay;
128         struct delayed_work     reconnect_work;
129
130         struct list_head        list;
131
132         struct blk_mq_tag_set   admin_tag_set;
133         struct nvme_rdma_device *device;
134
135         u64                     cap;
136         u32                     max_fr_pages;
137
138         union {
139                 struct sockaddr addr;
140                 struct sockaddr_in addr_in;
141         };
142
143         struct nvme_ctrl        ctrl;
144 };
145
146 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
147 {
148         return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
149 }
150
151 static LIST_HEAD(device_list);
152 static DEFINE_MUTEX(device_list_mutex);
153
154 static LIST_HEAD(nvme_rdma_ctrl_list);
155 static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
156
157 static struct workqueue_struct *nvme_rdma_wq;
158
159 /*
160  * Disabling this option makes small I/O go faster, but is fundamentally
161  * unsafe.  With it turned off we will have to register a global rkey that
162  * allows read and write access to all physical memory.
163  */
164 static bool register_always = true;
165 module_param(register_always, bool, 0444);
166 MODULE_PARM_DESC(register_always,
167          "Use memory registration even for contiguous memory regions");
168
169 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
170                 struct rdma_cm_event *event);
171 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
172 static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl);
173
174 /* XXX: really should move to a generic header sooner or later.. */
175 static inline void put_unaligned_le24(u32 val, u8 *p)
176 {
177         *p++ = val;
178         *p++ = val >> 8;
179         *p++ = val >> 16;
180 }
181
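    /* Index of the queue within ctrl->queues; index 0 is the admin queue. */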
182 static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
183 {
184         return queue - queue->ctrl->queues;
185 }
186
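    /* Bytes available for in-capsule (inline) data after the command SQE. */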
187 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
188 {
189         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
190 }
191
192 static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
193                 size_t capsule_size, enum dma_data_direction dir)
194 {
195         ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
196         kfree(qe->data);
197 }
198
199 static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
200                 size_t capsule_size, enum dma_data_direction dir)
201 {
202         qe->data = kzalloc(capsule_size, GFP_KERNEL);
203         if (!qe->data)
204                 return -ENOMEM;
205
206         qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
207         if (ib_dma_mapping_error(ibdev, qe->dma)) {
208                 kfree(qe->data);
209                 return -ENOMEM;
210         }
211
212         return 0;
213 }
214
215 static void nvme_rdma_free_ring(struct ib_device *ibdev,
216                 struct nvme_rdma_qe *ring, size_t ib_queue_size,
217                 size_t capsule_size, enum dma_data_direction dir)
218 {
219         int i;
220
221         for (i = 0; i < ib_queue_size; i++)
222                 nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
223         kfree(ring);
224 }
225
226 static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
227                 size_t ib_queue_size, size_t capsule_size,
228                 enum dma_data_direction dir)
229 {
230         struct nvme_rdma_qe *ring;
231         int i;
232
233         ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
234         if (!ring)
235                 return NULL;
236
237         for (i = 0; i < ib_queue_size; i++) {
238                 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
239                         goto out_free_ring;
240         }
241
242         return ring;
243
244 out_free_ring:
245         nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
246         return NULL;
247 }
248
249 static void nvme_rdma_qp_event(struct ib_event *event, void *context)
250 {
251         pr_debug("QP event %d\n", event->event);
252 }
253
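    /*
     * Wait for the CM handler to signal cm_done and return the cm_error it
     * recorded; the caller pre-initializes cm_error to -ETIMEDOUT.
     */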
254 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
255 {
256         wait_for_completion_interruptible_timeout(&queue->cm_done,
257                         msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
258         return queue->cm_error;
259 }
260
261 static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
262 {
263         struct nvme_rdma_device *dev = queue->device;
264         struct ib_qp_init_attr init_attr;
265         int ret;
266
267         memset(&init_attr, 0, sizeof(init_attr));
268         init_attr.event_handler = nvme_rdma_qp_event;
269         /* +1 for drain */
270         init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
271         /* +1 for drain */
272         init_attr.cap.max_recv_wr = queue->queue_size + 1;
273         init_attr.cap.max_recv_sge = 1;
274         init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
275         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
276         init_attr.qp_type = IB_QPT_RC;
277         init_attr.send_cq = queue->ib_cq;
278         init_attr.recv_cq = queue->ib_cq;
279
280         ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
281
282         queue->qp = queue->cm_id->qp;
283         return ret;
284 }
285
286 static int nvme_rdma_reinit_request(void *data, struct request *rq)
287 {
288         struct nvme_rdma_ctrl *ctrl = data;
289         struct nvme_rdma_device *dev = ctrl->device;
290         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
291         int ret = 0;
292
293         if (!req->need_inval)
294                 goto out;
295
296         ib_dereg_mr(req->mr);
297
298         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
299                         ctrl->max_fr_pages);
300         if (IS_ERR(req->mr)) {
301                 ret = PTR_ERR(req->mr);
302                 req->mr = NULL;
303         }
304
305         req->need_inval = false;
306
307 out:
308         return ret;
309 }
310
311 static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl,
312                 struct request *rq, unsigned int queue_idx)
313 {
314         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
315         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
316         struct nvme_rdma_device *dev = queue->device;
317
318         if (req->mr)
319                 ib_dereg_mr(req->mr);
320
321         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
322                         DMA_TO_DEVICE);
323 }
324
325 static void nvme_rdma_exit_request(void *data, struct request *rq,
326                                 unsigned int hctx_idx, unsigned int rq_idx)
327 {
328         return __nvme_rdma_exit_request(data, rq, hctx_idx + 1);
329 }
330
331 static void nvme_rdma_exit_admin_request(void *data, struct request *rq,
332                                 unsigned int hctx_idx, unsigned int rq_idx)
333 {
334         return __nvme_rdma_exit_request(data, rq, 0);
335 }
336
337 static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl,
338                 struct request *rq, unsigned int queue_idx)
339 {
340         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
341         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
342         struct nvme_rdma_device *dev = queue->device;
343         struct ib_device *ibdev = dev->dev;
344         int ret;
345
346         BUG_ON(queue_idx >= ctrl->queue_count);
347
348         ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
349                         DMA_TO_DEVICE);
350         if (ret)
351                 return ret;
352
353         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
354                         ctrl->max_fr_pages);
355         if (IS_ERR(req->mr)) {
356                 ret = PTR_ERR(req->mr);
357                 goto out_free_qe;
358         }
359
360         req->queue = queue;
361
362         return 0;
363
364 out_free_qe:
365         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
366                         DMA_TO_DEVICE);
367         return -ENOMEM;
368 }
369
370 static int nvme_rdma_init_request(void *data, struct request *rq,
371                                 unsigned int hctx_idx, unsigned int rq_idx,
372                                 unsigned int numa_node)
373 {
374         return __nvme_rdma_init_request(data, rq, hctx_idx + 1);
375 }
376
377 static int nvme_rdma_init_admin_request(void *data, struct request *rq,
378                                 unsigned int hctx_idx, unsigned int rq_idx,
379                                 unsigned int numa_node)
380 {
381         return __nvme_rdma_init_request(data, rq, 0);
382 }
383
384 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
385                 unsigned int hctx_idx)
386 {
387         struct nvme_rdma_ctrl *ctrl = data;
388         struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
389
390         BUG_ON(hctx_idx >= ctrl->queue_count);
391
392         hctx->driver_data = queue;
393         return 0;
394 }
395
396 static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
397                 unsigned int hctx_idx)
398 {
399         struct nvme_rdma_ctrl *ctrl = data;
400         struct nvme_rdma_queue *queue = &ctrl->queues[0];
401
402         BUG_ON(hctx_idx != 0);
403
404         hctx->driver_data = queue;
405         return 0;
406 }
407
408 static void nvme_rdma_free_dev(struct kref *ref)
409 {
410         struct nvme_rdma_device *ndev =
411                 container_of(ref, struct nvme_rdma_device, ref);
412
413         mutex_lock(&device_list_mutex);
414         list_del(&ndev->entry);
415         mutex_unlock(&device_list_mutex);
416
417         if (!register_always)
418                 ib_dereg_mr(ndev->mr);
419         ib_dealloc_pd(ndev->pd);
420
421         kfree(ndev);
422 }
423
424 static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
425 {
426         kref_put(&dev->ref, nvme_rdma_free_dev);
427 }
428
429 static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
430 {
431         return kref_get_unless_zero(&dev->ref);
432 }
433
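    /*
     * Find the nvme_rdma_device matching the CM ID's IB device (by node GUID)
     * and take a reference, or allocate a new one with a PD (and a global DMA
     * MR when register_always is disabled).
     */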
434 static struct nvme_rdma_device *
435 nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
436 {
437         struct nvme_rdma_device *ndev;
438
439         mutex_lock(&device_list_mutex);
440         list_for_each_entry(ndev, &device_list, entry) {
441                 if (ndev->dev->node_guid == cm_id->device->node_guid &&
442                     nvme_rdma_dev_get(ndev))
443                         goto out_unlock;
444         }
445
446         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
447         if (!ndev)
448                 goto out_err;
449
450         ndev->dev = cm_id->device;
451         kref_init(&ndev->ref);
452
453         ndev->pd = ib_alloc_pd(ndev->dev);
454         if (IS_ERR(ndev->pd))
455                 goto out_free_dev;
456
457         if (!register_always) {
458                 ndev->mr = ib_get_dma_mr(ndev->pd,
459                                             IB_ACCESS_LOCAL_WRITE |
460                                             IB_ACCESS_REMOTE_READ |
461                                             IB_ACCESS_REMOTE_WRITE);
462                 if (IS_ERR(ndev->mr))
463                         goto out_free_pd;
464         }
465
466         if (!(ndev->dev->attrs.device_cap_flags &
467               IB_DEVICE_MEM_MGT_EXTENSIONS)) {
468                 dev_err(&ndev->dev->dev,
469                         "Memory registrations not supported.\n");
470                 goto out_free_mr;
471         }
472
473         list_add(&ndev->entry, &device_list);
474 out_unlock:
475         mutex_unlock(&device_list_mutex);
476         return ndev;
477
478 out_free_mr:
479         if (!register_always)
480                 ib_dereg_mr(ndev->mr);
481 out_free_pd:
482         ib_dealloc_pd(ndev->pd);
483 out_free_dev:
484         kfree(ndev);
485 out_err:
486         mutex_unlock(&device_list_mutex);
487         return NULL;
488 }
489
490 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
491 {
492         struct nvme_rdma_device *dev = queue->device;
493         struct ib_device *ibdev = dev->dev;
494
495         rdma_destroy_qp(queue->cm_id);
496         ib_free_cq(queue->ib_cq);
497
498         nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
499                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
500
501         nvme_rdma_dev_put(dev);
502 }
503
504 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue,
505                 struct nvme_rdma_device *dev)
506 {
507         struct ib_device *ibdev = dev->dev;
508         const int send_wr_factor = 3;                   /* MR, SEND, INV */
509         const int cq_factor = send_wr_factor + 1;       /* + RECV */
510         int comp_vector, idx = nvme_rdma_queue_idx(queue);
511
512         int ret;
513
514         queue->device = dev;
515
516         /*
517          * The admin queue is barely used once the controller is live, so don't
518          * bother to spread it out.
519          */
520         if (idx == 0)
521                 comp_vector = 0;
522         else
523                 comp_vector = idx % ibdev->num_comp_vectors;
524
525
526         /* +1 for ib_stop_cq */
527         queue->ib_cq = ib_alloc_cq(dev->dev, queue,
528                                 cq_factor * queue->queue_size + 1, comp_vector,
529                                 IB_POLL_SOFTIRQ);
530         if (IS_ERR(queue->ib_cq)) {
531                 ret = PTR_ERR(queue->ib_cq);
532                 goto out;
533         }
534
535         ret = nvme_rdma_create_qp(queue, send_wr_factor);
536         if (ret)
537                 goto out_destroy_ib_cq;
538
539         queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
540                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
541         if (!queue->rsp_ring) {
542                 ret = -ENOMEM;
543                 goto out_destroy_qp;
544         }
545
546         return 0;
547
548 out_destroy_qp:
549         ib_destroy_qp(queue->qp);
550 out_destroy_ib_cq:
551         ib_free_cq(queue->ib_cq);
552 out:
553         return ret;
554 }
555
556 static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
557                 int idx, size_t queue_size)
558 {
559         struct nvme_rdma_queue *queue;
560         int ret;
561
562         queue = &ctrl->queues[idx];
563         queue->ctrl = ctrl;
564         init_completion(&queue->cm_done);
565
566         if (idx > 0)
567                 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
568         else
569                 queue->cmnd_capsule_len = sizeof(struct nvme_command);
570
571         queue->queue_size = queue_size;
572
573         queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
574                         RDMA_PS_TCP, IB_QPT_RC);
575         if (IS_ERR(queue->cm_id)) {
576                 dev_info(ctrl->ctrl.device,
577                         "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
578                 return PTR_ERR(queue->cm_id);
579         }
580
581         queue->cm_error = -ETIMEDOUT;
582         ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr,
583                         NVME_RDMA_CONNECT_TIMEOUT_MS);
584         if (ret) {
585                 dev_info(ctrl->ctrl.device,
586                         "rdma_resolve_addr failed (%d).\n", ret);
587                 goto out_destroy_cm_id;
588         }
589
590         ret = nvme_rdma_wait_for_cm(queue);
591         if (ret) {
592                 dev_info(ctrl->ctrl.device,
593                         "rdma_resolve_addr wait failed (%d).\n", ret);
594                 goto out_destroy_cm_id;
595         }
596
597         set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags);
598
599         return 0;
600
601 out_destroy_cm_id:
602         rdma_destroy_id(queue->cm_id);
603         return ret;
604 }
605
606 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
607 {
608         rdma_disconnect(queue->cm_id);
609         ib_drain_qp(queue->qp);
610 }
611
612 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
613 {
614         nvme_rdma_destroy_queue_ib(queue);
615         rdma_destroy_id(queue->cm_id);
616 }
617
618 static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue)
619 {
620         if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
621                 return;
622         nvme_rdma_stop_queue(queue);
623         nvme_rdma_free_queue(queue);
624 }
625
626 static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
627 {
628         int i;
629
630         for (i = 1; i < ctrl->queue_count; i++)
631                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
632 }
633
634 static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
635 {
636         int i, ret = 0;
637
638         for (i = 1; i < ctrl->queue_count; i++) {
639                 ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
640                 if (ret)
641                         break;
642         }
643
644         return ret;
645 }
646
647 static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
648 {
649         int i, ret;
650
651         for (i = 1; i < ctrl->queue_count; i++) {
652                 ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.sqsize);
653                 if (ret) {
654                         dev_info(ctrl->ctrl.device,
655                                 "failed to initialize i/o queue: %d\n", ret);
656                         goto out_free_queues;
657                 }
658         }
659
660         return 0;
661
662 out_free_queues:
663         for (; i >= 1; i--)
664                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
665
666         return ret;
667 }
668
669 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
670 {
671         nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
672                         sizeof(struct nvme_command), DMA_TO_DEVICE);
673         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
674         blk_cleanup_queue(ctrl->ctrl.admin_q);
675         blk_mq_free_tag_set(&ctrl->admin_tag_set);
676         nvme_rdma_dev_put(ctrl->device);
677 }
678
679 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
680 {
681         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
682
683         if (list_empty(&ctrl->list))
684                 goto free_ctrl;
685
686         mutex_lock(&nvme_rdma_ctrl_mutex);
687         list_del(&ctrl->list);
688         mutex_unlock(&nvme_rdma_ctrl_mutex);
689
690         if (ctrl->ctrl.tagset) {
691                 blk_cleanup_queue(ctrl->ctrl.connect_q);
692                 blk_mq_free_tag_set(&ctrl->tag_set);
693                 nvme_rdma_dev_put(ctrl->device);
694         }
695         kfree(ctrl->queues);
696         nvmf_free_options(nctrl->opts);
697 free_ctrl:
698         kfree(ctrl);
699 }
700
701 static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
702 {
703         struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
704                         struct nvme_rdma_ctrl, reconnect_work);
705         bool changed;
706         int ret;
707
708         if (ctrl->queue_count > 1) {
709                 nvme_rdma_free_io_queues(ctrl);
710
711                 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
712                 if (ret)
713                         goto requeue;
714         }
715
716         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
717
718         ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
719         if (ret)
720                 goto requeue;
721
722         ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
723         if (ret)
724                 goto requeue;
725
726         blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
727
728         ret = nvmf_connect_admin_queue(&ctrl->ctrl);
729         if (ret)
730                 goto stop_admin_q;
731
732         ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
733         if (ret)
734                 goto stop_admin_q;
735
736         nvme_start_keep_alive(&ctrl->ctrl);
737
738         if (ctrl->queue_count > 1) {
739                 ret = nvme_rdma_init_io_queues(ctrl);
740                 if (ret)
741                         goto stop_admin_q;
742
743                 ret = nvme_rdma_connect_io_queues(ctrl);
744                 if (ret)
745                         goto stop_admin_q;
746         }
747
748         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
749         WARN_ON_ONCE(!changed);
750
751         if (ctrl->queue_count > 1)
752                 nvme_start_queues(&ctrl->ctrl);
753
754         dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
755
756         return;
757
758 stop_admin_q:
759         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
760 requeue:
761         /* Make sure we are not resetting/deleting */
762         if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
763                 dev_info(ctrl->ctrl.device,
764                         "Failed reconnect attempt, requeueing...\n");
765                 queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
766                                         ctrl->reconnect_delay * HZ);
767         }
768 }
769
770 static void nvme_rdma_error_recovery_work(struct work_struct *work)
771 {
772         struct nvme_rdma_ctrl *ctrl = container_of(work,
773                         struct nvme_rdma_ctrl, err_work);
774
775         nvme_stop_keep_alive(&ctrl->ctrl);
776         if (ctrl->queue_count > 1)
777                 nvme_stop_queues(&ctrl->ctrl);
778         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
779
780         /* We must fast-fail or requeue all our inflight requests */
781         if (ctrl->queue_count > 1)
782                 blk_mq_tagset_busy_iter(&ctrl->tag_set,
783                                         nvme_cancel_request, &ctrl->ctrl);
784         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
785                                 nvme_cancel_request, &ctrl->ctrl);
786
787         dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
788                 ctrl->reconnect_delay);
789
790         queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
791                                 ctrl->reconnect_delay * HZ);
792 }
793
794 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
795 {
796         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
797                 return;
798
799         queue_work(nvme_rdma_wq, &ctrl->err_work);
800 }
801
802 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
803                 const char *op)
804 {
805         struct nvme_rdma_queue *queue = cq->cq_context;
806         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
807
808         if (ctrl->ctrl.state == NVME_CTRL_LIVE)
809                 dev_info(ctrl->ctrl.device,
810                              "%s for CQE 0x%p failed with status %s (%d)\n",
811                              op, wc->wr_cqe,
812                              ib_wc_status_msg(wc->status), wc->status);
813         nvme_rdma_error_recovery(ctrl);
814 }
815
816 static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
817 {
818         if (unlikely(wc->status != IB_WC_SUCCESS))
819                 nvme_rdma_wr_error(cq, wc, "MEMREG");
820 }
821
822 static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
823 {
824         if (unlikely(wc->status != IB_WC_SUCCESS))
825                 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
826 }
827
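    /*
     * Post a LOCAL_INV WR to invalidate the request's MR rkey when the target
     * did not invalidate it remotely.
     */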
828 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
829                 struct nvme_rdma_request *req)
830 {
831         struct ib_send_wr *bad_wr;
832         struct ib_send_wr wr = {
833                 .opcode             = IB_WR_LOCAL_INV,
834                 .next               = NULL,
835                 .num_sge            = 0,
836                 .send_flags         = 0,
837                 .ex.invalidate_rkey = req->mr->rkey,
838         };
839
840         req->reg_cqe.done = nvme_rdma_inv_rkey_done;
841         wr.wr_cqe = &req->reg_cqe;
842
843         return ib_post_send(queue->qp, &wr, &bad_wr);
844 }
845
846 static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
847                 struct request *rq)
848 {
849         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
850         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
851         struct nvme_rdma_device *dev = queue->device;
852         struct ib_device *ibdev = dev->dev;
853         int res;
854
855         if (!blk_rq_bytes(rq))
856                 return;
857
858         if (req->need_inval) {
859                 res = nvme_rdma_inv_rkey(queue, req);
860                 if (res < 0) {
861                         dev_err(ctrl->ctrl.device,
862                                 "Queueing INV WR for rkey %#x failed (%d)\n",
863                                 req->mr->rkey, res);
864                         nvme_rdma_error_recovery(queue->ctrl);
865                 }
866         }
867
868         ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
869                         req->nents, rq_data_dir(rq) ==
870                                     WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
871
872         nvme_cleanup_cmd(rq);
873         sg_free_table_chained(&req->sg_table, true);
874 }
875
876 static int nvme_rdma_set_sg_null(struct nvme_command *c)
877 {
878         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
879
880         sg->addr = 0;
881         put_unaligned_le24(0, sg->length);
882         put_unaligned_le32(0, sg->key);
883         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
884         return 0;
885 }
886
887 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
888                 struct nvme_rdma_request *req, struct nvme_command *c)
889 {
890         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
891
892         req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
893         req->sge[1].length = sg_dma_len(req->sg_table.sgl);
894         req->sge[1].lkey = queue->device->pd->local_dma_lkey;
895
896         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
897         sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
898         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
899
900         req->inline_data = true;
901         req->num_sge++;
902         return 0;
903 }
904
905 static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
906                 struct nvme_rdma_request *req, struct nvme_command *c)
907 {
908         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
909
910         sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
911         put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
912         put_unaligned_le32(queue->device->mr->rkey, sg->key);
913         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
914         return 0;
915 }
916
917 static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
918                 struct nvme_rdma_request *req, struct nvme_command *c,
919                 int count)
920 {
921         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
922         int nr;
923
924         nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
925         if (nr < count) {
926                 if (nr < 0)
927                         return nr;
928                 return -EINVAL;
929         }
930
931         ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
932
933         req->reg_cqe.done = nvme_rdma_memreg_done;
934         memset(&req->reg_wr, 0, sizeof(req->reg_wr));
935         req->reg_wr.wr.opcode = IB_WR_REG_MR;
936         req->reg_wr.wr.wr_cqe = &req->reg_cqe;
937         req->reg_wr.wr.num_sge = 0;
938         req->reg_wr.mr = req->mr;
939         req->reg_wr.key = req->mr->rkey;
940         req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
941                              IB_ACCESS_REMOTE_READ |
942                              IB_ACCESS_REMOTE_WRITE;
943
944         req->need_inval = true;
945
946         sg->addr = cpu_to_le64(req->mr->iova);
947         put_unaligned_le24(req->mr->length, sg->length);
948         put_unaligned_le32(req->mr->rkey, sg->key);
949         sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
950                         NVME_SGL_FMT_INVALIDATE;
951
952         return 0;
953 }
954
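    /*
     * Map the request data and build the command SGL: a NULL SGL when there
     * is no data, inline data for small writes on I/O queues, a single keyed
     * SGL entry backed by the global MR when registration is not required, or
     * a registered MR otherwise.
     */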
955 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
956                 struct request *rq, unsigned int map_len,
957                 struct nvme_command *c)
958 {
959         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
960         struct nvme_rdma_device *dev = queue->device;
961         struct ib_device *ibdev = dev->dev;
962         int nents, count;
963         int ret;
964
965         req->num_sge = 1;
966         req->inline_data = false;
967         req->need_inval = false;
968
969         c->common.flags |= NVME_CMD_SGL_METABUF;
970
971         if (!blk_rq_bytes(rq))
972                 return nvme_rdma_set_sg_null(c);
973
974         req->sg_table.sgl = req->first_sgl;
975         ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments,
976                                 req->sg_table.sgl);
977         if (ret)
978                 return -ENOMEM;
979
980         nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
981         BUG_ON(nents > rq->nr_phys_segments);
982         req->nents = nents;
983
984         count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents,
985                     rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
986         if (unlikely(count <= 0)) {
987                 sg_free_table_chained(&req->sg_table, true);
988                 return -EIO;
989         }
990
991         if (count == 1) {
992                 if (rq_data_dir(rq) == WRITE &&
993                     map_len <= nvme_rdma_inline_data_size(queue) &&
994                     nvme_rdma_queue_idx(queue))
995                         return nvme_rdma_map_sg_inline(queue, req, c);
996
997                 if (!register_always)
998                         return nvme_rdma_map_sg_single(queue, req, c);
999         }
1000
1001         return nvme_rdma_map_sg_fr(queue, req, c, count);
1002 }
1003
1004 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1005 {
1006         if (unlikely(wc->status != IB_WC_SUCCESS))
1007                 nvme_rdma_wr_error(cq, wc, "SEND");
1008 }
1009
1010 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1011                 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1012                 struct ib_send_wr *first, bool flush)
1013 {
1014         struct ib_send_wr wr, *bad_wr;
1015         int ret;
1016
1017         sge->addr   = qe->dma;
1018         sge->length = sizeof(struct nvme_command);
1019         sge->lkey   = queue->device->pd->local_dma_lkey;
1020
1021         qe->cqe.done = nvme_rdma_send_done;
1022
1023         wr.next       = NULL;
1024         wr.wr_cqe     = &qe->cqe;
1025         wr.sg_list    = sge;
1026         wr.num_sge    = num_sge;
1027         wr.opcode     = IB_WR_SEND;
1028         wr.send_flags = 0;
1029
1030         /*
1031  * Unsignalled send completions are another giant disaster in the
1032  * IB Verbs spec:  If we don't regularly post signalled sends
1033  * the send queue will fill up and only a QP reset will rescue us.
1034  * Would have been way too obvious to handle this in hardware or
1035  * at least the RDMA stack..
1036  *
1037  * This messy and racy code snippet is copied and pasted from the iSER
1038  * initiator, and the magic '32' comes from there as well.
1039  *
1040  * Always signal the flushes. The magic request used for the flush
1041  * sequencer is not allocated in our driver's tagset and it's
1042  * triggered to be freed by blk_cleanup_queue(). So we need to
1043  * always mark it as signaled to ensure that the "wr_cqe", which is
1044  * embedded in the request's payload, is not freed when __ib_process_cq()
1045          * calls wr_cqe->done().
1046          */
1047         if ((++queue->sig_count % 32) == 0 || flush)
1048                 wr.send_flags |= IB_SEND_SIGNALED;
1049
1050         if (first)
1051                 first->next = &wr;
1052         else
1053                 first = &wr;
1054
1055         ret = ib_post_send(queue->qp, first, &bad_wr);
1056         if (ret) {
1057                 dev_err(queue->ctrl->ctrl.device,
1058                              "%s failed with error code %d\n", __func__, ret);
1059         }
1060         return ret;
1061 }
1062
1063 static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1064                 struct nvme_rdma_qe *qe)
1065 {
1066         struct ib_recv_wr wr, *bad_wr;
1067         struct ib_sge list;
1068         int ret;
1069
1070         list.addr   = qe->dma;
1071         list.length = sizeof(struct nvme_completion);
1072         list.lkey   = queue->device->pd->local_dma_lkey;
1073
1074         qe->cqe.done = nvme_rdma_recv_done;
1075
1076         wr.next     = NULL;
1077         wr.wr_cqe   = &qe->cqe;
1078         wr.sg_list  = &list;
1079         wr.num_sge  = 1;
1080
1081         ret = ib_post_recv(queue->qp, &wr, &bad_wr);
1082         if (ret) {
1083                 dev_err(queue->ctrl->ctrl.device,
1084                         "%s failed with error code %d\n", __func__, ret);
1085         }
1086         return ret;
1087 }
1088
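     /*
      * Tag set backing this queue: the admin tag set for queue 0, the I/O
      * tag set otherwise.
      */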
1089 static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1090 {
1091         u32 queue_idx = nvme_rdma_queue_idx(queue);
1092
1093         if (queue_idx == 0)
1094                 return queue->ctrl->admin_tag_set.tags[queue_idx];
1095         return queue->ctrl->tag_set.tags[queue_idx - 1];
1096 }
1097
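     /*
      * AEN commands are not allocated from the blk-mq tagset; they use the
      * out-of-range command id NVME_RDMA_AQ_BLKMQ_DEPTH so the receive path
      * can tell their completions apart.
      */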
1098 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
1099 {
1100         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1101         struct nvme_rdma_queue *queue = &ctrl->queues[0];
1102         struct ib_device *dev = queue->device->dev;
1103         struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1104         struct nvme_command *cmd = sqe->data;
1105         struct ib_sge sge;
1106         int ret;
1107
1108         if (WARN_ON_ONCE(aer_idx != 0))
1109                 return;
1110
1111         ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1112
1113         memset(cmd, 0, sizeof(*cmd));
1114         cmd->common.opcode = nvme_admin_async_event;
1115         cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
1116         cmd->common.flags |= NVME_CMD_SGL_METABUF;
1117         nvme_rdma_set_sg_null(cmd);
1118
1119         ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1120                         DMA_TO_DEVICE);
1121
1122         ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
1123         WARN_ON_ONCE(ret);
1124 }
1125
1126 static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1127                 struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1128 {
1129         u16 status = le16_to_cpu(cqe->status);
1130         struct request *rq;
1131         struct nvme_rdma_request *req;
1132         int ret = 0;
1133
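             /* Bit 0 of the completion status is the phase tag; strip it. */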
1134         status >>= 1;
1135
1136         rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1137         if (!rq) {
1138                 dev_err(queue->ctrl->ctrl.device,
1139                         "tag 0x%x on QP %#x not found\n",
1140                         cqe->command_id, queue->qp->qp_num);
1141                 nvme_rdma_error_recovery(queue->ctrl);
1142                 return ret;
1143         }
1144         req = blk_mq_rq_to_pdu(rq);
1145
1146         if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
1147                 memcpy(rq->special, cqe, sizeof(*cqe));
1148
1149         if (rq->tag == tag)
1150                 ret = 1;
1151
1152         if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
1153             wc->ex.invalidate_rkey == req->mr->rkey)
1154                 req->need_inval = false;
1155
1156         blk_mq_complete_request(rq, status);
1157
1158         return ret;
1159 }
1160
1161 static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1162 {
1163         struct nvme_rdma_qe *qe =
1164                 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1165         struct nvme_rdma_queue *queue = cq->cq_context;
1166         struct ib_device *ibdev = queue->device->dev;
1167         struct nvme_completion *cqe = qe->data;
1168         const size_t len = sizeof(struct nvme_completion);
1169         int ret = 0;
1170
1171         if (unlikely(wc->status != IB_WC_SUCCESS)) {
1172                 nvme_rdma_wr_error(cq, wc, "RECV");
1173                 return 0;
1174         }
1175
1176         ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1177         /*
1178          * AEN requests are special as they don't time out and can
1179          * survive any kind of queue freeze and often don't respond to
1180          * aborts.  We don't even bother to allocate a struct request
1181          * for them but rather special case them here.
1182          */
1183         if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1184                         cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
1185                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
1186         else
1187                 ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1188         ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1189
1190         nvme_rdma_post_recv(queue, qe);
1191         return ret;
1192 }
1193
1194 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1195 {
1196         __nvme_rdma_recv_done(cq, wc, -1);
1197 }
1198
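     /* Post a receive WR for each response ring entry once connected. */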
1199 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1200 {
1201         int ret, i;
1202
1203         for (i = 0; i < queue->queue_size; i++) {
1204                 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1205                 if (ret)
1206                         goto out_destroy_queue_ib;
1207         }
1208
1209         return 0;
1210
1211 out_destroy_queue_ib:
1212         nvme_rdma_destroy_queue_ib(queue);
1213         return ret;
1214 }
1215
1216 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1217                 struct rdma_cm_event *ev)
1218 {
1219         if (ev->param.conn.private_data_len) {
1220                 struct nvme_rdma_cm_rej *rej =
1221                         (struct nvme_rdma_cm_rej *)ev->param.conn.private_data;
1222
1223                 dev_err(queue->ctrl->ctrl.device,
1224                         "Connect rejected, status %d.\n", le16_to_cpu(rej->sts));
1225                 /* XXX: Think of something clever to do here... */
1226         } else {
1227                 dev_err(queue->ctrl->ctrl.device,
1228                         "Connect rejected, no private data.\n");
1229         }
1230
1231         return -ECONNRESET;
1232 }
1233
1234 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1235 {
1236         struct nvme_rdma_device *dev;
1237         int ret;
1238
1239         dev = nvme_rdma_find_get_device(queue->cm_id);
1240         if (!dev) {
1241                 dev_err(queue->cm_id->device->dma_device,
1242                         "no client data found!\n");
1243                 return -ECONNREFUSED;
1244         }
1245
1246         ret = nvme_rdma_create_queue_ib(queue, dev);
1247         if (ret) {
1248                 nvme_rdma_dev_put(dev);
1249                 goto out;
1250         }
1251
1252         ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1253         if (ret) {
1254                 dev_err(queue->ctrl->ctrl.device,
1255                         "rdma_resolve_route failed (%d).\n",
1256                         queue->cm_error);
1257                 goto out_destroy_queue;
1258         }
1259
1260         return 0;
1261
1262 out_destroy_queue:
1263         nvme_rdma_destroy_queue_ib(queue);
1264 out:
1265         return ret;
1266 }
1267
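     /*
      * Route is resolved: issue rdma_connect() with the NVMe/RDMA CM request
      * private data (record format, queue id, host receive/send queue sizes).
      */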
1268 static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1269 {
1270         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1271         struct rdma_conn_param param = { };
1272         struct nvme_rdma_cm_req priv;
1273         int ret;
1274
1275         param.qp_num = queue->qp->qp_num;
1276         param.flow_control = 1;
1277
1278         param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1279         /* maximum retry count */
1280         param.retry_count = 7;
1281         param.rnr_retry_count = 7;
1282         param.private_data = &priv;
1283         param.private_data_len = sizeof(priv);
1284
1285         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1286         priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1287         priv.hrqsize = cpu_to_le16(queue->queue_size);
1288         priv.hsqsize = cpu_to_le16(queue->queue_size);
1289
1290         ret = rdma_connect(queue->cm_id, &param);
1291         if (ret) {
1292                 dev_err(ctrl->ctrl.device,
1293                         "rdma_connect failed (%d).\n", ret);
1294                 goto out_destroy_queue_ib;
1295         }
1296
1297         return 0;
1298
1299 out_destroy_queue_ib:
1300         nvme_rdma_destroy_queue_ib(queue);
1301         return ret;
1302 }
1303
1304 /**
1305  * nvme_rdma_device_unplug() - Handle RDMA device unplug
1306  * @queue:      Queue that owns the cm_id that caught the event
1307  *
1308  * DEVICE_REMOVAL event notifies us that the RDMA device is about
1309  * to unplug so we should take care of destroying our RDMA resources.
1310  * This event will be generated for each allocated cm_id.
1311  *
1312  * In our case, the RDMA resources are managed per controller and not
1313  * only per queue. So the way we handle this is we trigger an implicit
1314  * controller deletion upon the first DEVICE_REMOVAL event we see, and
1315  * hold the event inflight until the controller deletion is completed.
1316  *
1317  * One exception that we need to handle is the destruction of the cm_id
1318  * that caught the event. Since we hold the callout until the controller
1319  * deletion is completed, we'll deadlock if the controller deletion
1320  * calls rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
1321  * of destroying this queue beforehand, destroy the queue resources after
1322  * the controller deletion has completed, and let the cm_id itself be
1323  * destroyed implicitly by returning a non-zero rc to the callout.
1324  */
1325 static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
1326 {
1327         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1328         int ret, ctrl_deleted = 0;
1329
1330         /* First disable the queue so ctrl delete won't free it */
1331         if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
1332                 goto out;
1333
1334         /* delete the controller */
1335         ret = __nvme_rdma_del_ctrl(ctrl);
1336         if (!ret) {
1337                 dev_warn(ctrl->ctrl.device,
1338                         "Got rdma device removal event, deleting ctrl\n");
1339                 flush_work(&ctrl->delete_work);
1340
1341                 /* Return non-zero so the cm_id will be destroyed implicitly */
1342                 ctrl_deleted = 1;
1343
1344                 /* Free this queue ourselves */
1345                 rdma_disconnect(queue->cm_id);
1346                 ib_drain_qp(queue->qp);
1347                 nvme_rdma_destroy_queue_ib(queue);
1348         }
1349
1350 out:
1351         return ctrl_deleted;
1352 }
1353
1354 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1355                 struct rdma_cm_event *ev)
1356 {
1357         struct nvme_rdma_queue *queue = cm_id->context;
1358         int cm_error = 0;
1359
1360         dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1361                 rdma_event_msg(ev->event), ev->event,
1362                 ev->status, cm_id);
1363
1364         switch (ev->event) {
1365         case RDMA_CM_EVENT_ADDR_RESOLVED:
1366                 cm_error = nvme_rdma_addr_resolved(queue);
1367                 break;
1368         case RDMA_CM_EVENT_ROUTE_RESOLVED:
1369                 cm_error = nvme_rdma_route_resolved(queue);
1370                 break;
1371         case RDMA_CM_EVENT_ESTABLISHED:
1372                 queue->cm_error = nvme_rdma_conn_established(queue);
1373                 /* complete cm_done regardless of success/failure */
1374                 complete(&queue->cm_done);
1375                 return 0;
1376         case RDMA_CM_EVENT_REJECTED:
1377                 cm_error = nvme_rdma_conn_rejected(queue, ev);
1378                 break;
1379         case RDMA_CM_EVENT_ADDR_ERROR:
1380         case RDMA_CM_EVENT_ROUTE_ERROR:
1381         case RDMA_CM_EVENT_CONNECT_ERROR:
1382         case RDMA_CM_EVENT_UNREACHABLE:
1383                 dev_dbg(queue->ctrl->ctrl.device,
1384                         "CM error event %d\n", ev->event);
1385                 cm_error = -ECONNRESET;
1386                 break;
1387         case RDMA_CM_EVENT_DISCONNECTED:
1388         case RDMA_CM_EVENT_ADDR_CHANGE:
1389         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1390                 dev_dbg(queue->ctrl->ctrl.device,
1391                         "disconnect received - connection closed\n");
1392                 nvme_rdma_error_recovery(queue->ctrl);
1393                 break;
1394         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1395                 /* returning 1 means implicit CM ID destroy */
1396                 return nvme_rdma_device_unplug(queue);
1397         default:
1398                 dev_err(queue->ctrl->ctrl.device,
1399                         "Unexpected RDMA CM event (%d)\n", ev->event);
1400                 nvme_rdma_error_recovery(queue->ctrl);
1401                 break;
1402         }
1403
1404         if (cm_error) {
1405                 queue->cm_error = cm_error;
1406                 complete(&queue->cm_done);
1407         }
1408
1409         return 0;
1410 }
1411
1412 static enum blk_eh_timer_return
1413 nvme_rdma_timeout(struct request *rq, bool reserved)
1414 {
1415         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1416
1417         /* queue error recovery */
1418         nvme_rdma_error_recovery(req->queue->ctrl);
1419
1420         /* fail with DNR on cmd timeout */
1421         rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
1422
1423         return BLK_EH_HANDLED;
1424 }
1425
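     /*
      * blk-mq .queue_rq handler: build the command capsule, map the request
      * data and post the send, chaining a memory registration WR in front of
      * it when one is needed.
      */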
1426 static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1427                 const struct blk_mq_queue_data *bd)
1428 {
1429         struct nvme_ns *ns = hctx->queue->queuedata;
1430         struct nvme_rdma_queue *queue = hctx->driver_data;
1431         struct request *rq = bd->rq;
1432         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1433         struct nvme_rdma_qe *sqe = &req->sqe;
1434         struct nvme_command *c = sqe->data;
1435         bool flush = false;
1436         struct ib_device *dev;
1437         unsigned int map_len;
1438         int ret;
1439
1440         WARN_ON_ONCE(rq->tag < 0);
1441
1442         dev = queue->device->dev;
1443         ib_dma_sync_single_for_cpu(dev, sqe->dma,
1444                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1445
1446         ret = nvme_setup_cmd(ns, rq, c);
1447         if (ret)
1448                 return ret;
1449
1450         c->common.command_id = rq->tag;
1451         blk_mq_start_request(rq);
1452
1453         map_len = nvme_map_len(rq);
1454         ret = nvme_rdma_map_data(queue, rq, map_len, c);
1455         if (ret < 0) {
1456                 dev_err(queue->ctrl->ctrl.device,
1457                              "Failed to map data (%d)\n", ret);
1458                 nvme_cleanup_cmd(rq);
1459                 goto err;
1460         }
1461
1462         ib_dma_sync_single_for_device(dev, sqe->dma,
1463                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1464
1465         if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH)
1466                 flush = true;
1467         ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1468                         req->need_inval ? &req->reg_wr.wr : NULL, flush);
1469         if (ret) {
1470                 nvme_rdma_unmap_data(queue, rq);
1471                 goto err;
1472         }
1473
1474         return BLK_MQ_RQ_QUEUE_OK;
1475 err:
1476         return (ret == -ENOMEM || ret == -EAGAIN) ?
1477                 BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
1478 }
1479
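     /*
      * blk-mq .poll handler: drain the CQ and report whether a completion for
      * the given tag was seen.
      */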
1480 static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1481 {
1482         struct nvme_rdma_queue *queue = hctx->driver_data;
1483         struct ib_cq *cq = queue->ib_cq;
1484         struct ib_wc wc;
1485         int found = 0;
1486
1487         ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
1488         while (ib_poll_cq(cq, 1, &wc) > 0) {
1489                 struct ib_cqe *cqe = wc.wr_cqe;
1490
1491                 if (cqe) {
1492                         if (cqe->done == nvme_rdma_recv_done)
1493                                 found |= __nvme_rdma_recv_done(cq, &wc, tag);
1494                         else
1495                                 cqe->done(cq, &wc);
1496                 }
1497         }
1498
1499         return found;
1500 }
1501
1502 static void nvme_rdma_complete_rq(struct request *rq)
1503 {
1504         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1505         struct nvme_rdma_queue *queue = req->queue;
1506         int error = 0;
1507
1508         nvme_rdma_unmap_data(queue, rq);
1509
1510         if (unlikely(rq->errors)) {
1511                 if (nvme_req_needs_retry(rq, rq->errors)) {
1512                         nvme_requeue_req(rq);
1513                         return;
1514                 }
1515
1516                 if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
1517                         error = rq->errors;
1518                 else
1519                         error = nvme_error_status(rq->errors);
1520         }
1521
1522         blk_mq_end_request(rq, error);
1523 }
1524
1525 static struct blk_mq_ops nvme_rdma_mq_ops = {
1526         .queue_rq       = nvme_rdma_queue_rq,
1527         .complete       = nvme_rdma_complete_rq,
1528         .map_queue      = blk_mq_map_queue,
1529         .init_request   = nvme_rdma_init_request,
1530         .exit_request   = nvme_rdma_exit_request,
1531         .reinit_request = nvme_rdma_reinit_request,
1532         .init_hctx      = nvme_rdma_init_hctx,
1533         .poll           = nvme_rdma_poll,
1534         .timeout        = nvme_rdma_timeout,
1535 };
1536
1537 static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1538         .queue_rq       = nvme_rdma_queue_rq,
1539         .complete       = nvme_rdma_complete_rq,
1540         .map_queue      = blk_mq_map_queue,
1541         .init_request   = nvme_rdma_init_admin_request,
1542         .exit_request   = nvme_rdma_exit_admin_request,
1543         .reinit_request = nvme_rdma_reinit_request,
1544         .init_hctx      = nvme_rdma_init_admin_hctx,
1545         .timeout        = nvme_rdma_timeout,
1546 };
1547
1548 static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
1549 {
1550         int error;
1551
1552         error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
1553         if (error)
1554                 return error;
1555
1556         ctrl->device = ctrl->queues[0].device;
1557
1558         /*
1559          * We need a reference on the device as long as the tag_set is alive,
1560          * as the MRs in the request structures need a valid ib_device.
1561          */
1562         error = -EINVAL;
1563         if (!nvme_rdma_dev_get(ctrl->device))
1564                 goto out_free_queue;
1565
1566         ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
1567                 ctrl->device->dev->attrs.max_fast_reg_page_list_len);
1568
1569         memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
1570         ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops;
1571         ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
1572         ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
1573         ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
1574         ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1575                 SG_CHUNK_SIZE * sizeof(struct scatterlist);
1576         ctrl->admin_tag_set.driver_data = ctrl;
1577         ctrl->admin_tag_set.nr_hw_queues = 1;
1578         ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
1579
1580         error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
1581         if (error)
1582                 goto out_put_dev;
1583
1584         ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
1585         if (IS_ERR(ctrl->ctrl.admin_q)) {
1586                 error = PTR_ERR(ctrl->ctrl.admin_q);
1587                 goto out_free_tagset;
1588         }
1589
1590         error = nvmf_connect_admin_queue(&ctrl->ctrl);
1591         if (error)
1592                 goto out_cleanup_queue;
1593
1594         error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
1595         if (error) {
1596                 dev_err(ctrl->ctrl.device,
1597                         "prop_get NVME_REG_CAP failed\n");
1598                 goto out_cleanup_queue;
1599         }
1600
1601         ctrl->ctrl.sqsize =
1602                 min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
1603
1604         error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
1605         if (error)
1606                 goto out_cleanup_queue;
1607
1608         ctrl->ctrl.max_hw_sectors =
1609                 (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
1610
1611         error = nvme_init_identify(&ctrl->ctrl);
1612         if (error)
1613                 goto out_cleanup_queue;
1614
1615         error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
1616                         &ctrl->async_event_sqe, sizeof(struct nvme_command),
1617                         DMA_TO_DEVICE);
1618         if (error)
1619                 goto out_cleanup_queue;
1620
1621         nvme_start_keep_alive(&ctrl->ctrl);
1622
1623         return 0;
1624
1625 out_cleanup_queue:
1626         blk_cleanup_queue(ctrl->ctrl.admin_q);
1627 out_free_tagset:
1628         /* disconnect and drain the queue before freeing the tagset */
1629         nvme_rdma_stop_queue(&ctrl->queues[0]);
1630         blk_mq_free_tag_set(&ctrl->admin_tag_set);
1631 out_put_dev:
1632         nvme_rdma_dev_put(ctrl->device);
1633 out_free_queue:
1634         nvme_rdma_free_queue(&ctrl->queues[0]);
1635         return error;
1636 }
1637
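/*
 * Teardown: stop keep-alive and any pending error-recovery/reconnect work,
 * cancel outstanding I/O and free the I/O queues, shut the controller down
 * if it is still live, then cancel admin requests and destroy the admin
 * queue.
 */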
1638 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
1639 {
1640         nvme_stop_keep_alive(&ctrl->ctrl);
1641         cancel_work_sync(&ctrl->err_work);
1642         cancel_delayed_work_sync(&ctrl->reconnect_work);
1643
1644         if (ctrl->queue_count > 1) {
1645                 nvme_stop_queues(&ctrl->ctrl);
1646                 blk_mq_tagset_busy_iter(&ctrl->tag_set,
1647                                         nvme_cancel_request, &ctrl->ctrl);
1648                 nvme_rdma_free_io_queues(ctrl);
1649         }
1650
1651         if (ctrl->ctrl.state == NVME_CTRL_LIVE)
1652                 nvme_shutdown_ctrl(&ctrl->ctrl);
1653
1654         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
1655         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
1656                                 nvme_cancel_request, &ctrl->ctrl);
1657         nvme_rdma_destroy_admin_queue(ctrl);
1658 }
1659
1660 static void nvme_rdma_del_ctrl_work(struct work_struct *work)
1661 {
1662         struct nvme_rdma_ctrl *ctrl = container_of(work,
1663                                 struct nvme_rdma_ctrl, delete_work);
1664
1665         nvme_remove_namespaces(&ctrl->ctrl);
1666         nvme_rdma_shutdown_ctrl(ctrl);
1667         nvme_uninit_ctrl(&ctrl->ctrl);
1668         nvme_put_ctrl(&ctrl->ctrl);
1669 }
1670
1671 static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
1672 {
1673         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
1674                 return -EBUSY;
1675
1676         if (!queue_work(nvme_rdma_wq, &ctrl->delete_work))
1677                 return -EBUSY;
1678
1679         return 0;
1680 }
1681
1682 static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
1683 {
1684         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1685         int ret;
1686
1687         ret = __nvme_rdma_del_ctrl(ctrl);
1688         if (ret)
1689                 return ret;
1690
1691         flush_work(&ctrl->delete_work);
1692
1693         return 0;
1694 }
1695
1696 static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
1697 {
1698         struct nvme_rdma_ctrl *ctrl = container_of(work,
1699                                 struct nvme_rdma_ctrl, delete_work);
1700
1701         nvme_remove_namespaces(&ctrl->ctrl);
1702         nvme_uninit_ctrl(&ctrl->ctrl);
1703         nvme_put_ctrl(&ctrl->ctrl);
1704 }
1705
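/*
 * Reset: shut the controller down, then rebuild the admin queue and, if
 * present, the I/O queues on the existing tag sets.  If any step fails the
 * controller is already shut down, so it is queued for removal instead.
 */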
1706 static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1707 {
1708         struct nvme_rdma_ctrl *ctrl = container_of(work,
1709                                         struct nvme_rdma_ctrl, reset_work);
1710         int ret;
1711         bool changed;
1712
1713         nvme_rdma_shutdown_ctrl(ctrl);
1714
1715         ret = nvme_rdma_configure_admin_queue(ctrl);
1716         if (ret) {
1717                 /* ctrl is already shutdown, just remove the ctrl */
1718                 INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work);
1719                 goto del_dead_ctrl;
1720         }
1721
1722         if (ctrl->queue_count > 1) {
1723                 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
1724                 if (ret)
1725                         goto del_dead_ctrl;
1726
1727                 ret = nvme_rdma_init_io_queues(ctrl);
1728                 if (ret)
1729                         goto del_dead_ctrl;
1730
1731                 ret = nvme_rdma_connect_io_queues(ctrl);
1732                 if (ret)
1733                         goto del_dead_ctrl;
1734         }
1735
1736         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1737         WARN_ON_ONCE(!changed);
1738
1739         if (ctrl->queue_count > 1) {
1740                 nvme_start_queues(&ctrl->ctrl);
1741                 nvme_queue_scan(&ctrl->ctrl);
1742         }
1743
1744         return;
1745
1746 del_dead_ctrl:
1747         /* Deleting this dead controller... */
1748         dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
1749         WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work));
1750 }
1751
1752 static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl)
1753 {
1754         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1755
1756         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
1757                 return -EBUSY;
1758
1759         if (!queue_work(nvme_rdma_wq, &ctrl->reset_work))
1760                 return -EBUSY;
1761
1762         flush_work(&ctrl->reset_work);
1763
1764         return 0;
1765 }
1766
1767 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
1768         .name                   = "rdma",
1769         .module                 = THIS_MODULE,
1770         .is_fabrics             = true,
1771         .reg_read32             = nvmf_reg_read32,
1772         .reg_read64             = nvmf_reg_read64,
1773         .reg_write32            = nvmf_reg_write32,
1774         .reset_ctrl             = nvme_rdma_reset_ctrl,
1775         .free_ctrl              = nvme_rdma_free_ctrl,
1776         .submit_async_event     = nvme_rdma_submit_async_event,
1777         .delete_ctrl            = nvme_rdma_del_ctrl,
1778         .get_subsysnqn          = nvmf_get_subsysnqn,
1779         .get_address            = nvmf_get_address,
1780 };
1781
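/*
 * I/O queue setup: negotiate the queue count with the target, pin the
 * ib_device for the lifetime of the tag set, size the tag set from
 * ctrl->ctrl.sqsize, create connect_q for the Fabrics Connect commands and
 * connect every I/O queue.
 */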
1782 static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
1783 {
1784         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
1785         int ret;
1786
1787         ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
1788         if (ret)
1789                 return ret;
1790
1791         ctrl->queue_count = opts->nr_io_queues + 1;
1792         if (ctrl->queue_count < 2)
1793                 return 0;
1794
1795         dev_info(ctrl->ctrl.device,
1796                 "creating %d I/O queues.\n", opts->nr_io_queues);
1797
1798         ret = nvme_rdma_init_io_queues(ctrl);
1799         if (ret)
1800                 return ret;
1801
1802         /*
1803          * We need a reference on the device as long as the tag_set is alive,
1804          * as the MRs in the request structures need a valid ib_device.
1805          */
1806         ret = -EINVAL;
1807         if (!nvme_rdma_dev_get(ctrl->device))
1808                 goto out_free_io_queues;
1809
1810         memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
1811         ctrl->tag_set.ops = &nvme_rdma_mq_ops;
1812         ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize;
1813         ctrl->tag_set.reserved_tags = 1; /* fabric connect */
1814         ctrl->tag_set.numa_node = NUMA_NO_NODE;
1815         ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1816         ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1817                 SG_CHUNK_SIZE * sizeof(struct scatterlist);
1818         ctrl->tag_set.driver_data = ctrl;
1819         ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
1820         ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
1821
1822         ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
1823         if (ret)
1824                 goto out_put_dev;
1825         ctrl->ctrl.tagset = &ctrl->tag_set;
1826
1827         ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
1828         if (IS_ERR(ctrl->ctrl.connect_q)) {
1829                 ret = PTR_ERR(ctrl->ctrl.connect_q);
1830                 goto out_free_tag_set;
1831         }
1832
1833         ret = nvme_rdma_connect_io_queues(ctrl);
1834         if (ret)
1835                 goto out_cleanup_connect_q;
1836
1837         return 0;
1838
1839 out_cleanup_connect_q:
1840         blk_cleanup_queue(ctrl->ctrl.connect_q);
1841 out_free_tag_set:
1842         blk_mq_free_tag_set(&ctrl->tag_set);
1843 out_put_dev:
1844         nvme_rdma_dev_put(ctrl->device);
1845 out_free_io_queues:
1846         nvme_rdma_free_io_queues(ctrl);
1847         return ret;
1848 }
1849
1850 static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
1851 {
1852         u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
1853         size_t buflen = strlen(p);
1854
1855         /* XXX: handle IPv6 addresses */
1856
1857         if (buflen > INET_ADDRSTRLEN)
1858                 return -EINVAL;
1859         if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
1860                 return -EINVAL;
1861         in_addr->sin_family = AF_INET;
1862         return 0;
1863 }
1864
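/*
 * The XXX note above flags the missing IPv6 support.  A rough sketch of an
 * IPv6 variant is kept below under #if 0; the helper is hypothetical and
 * not wired up anywhere, it only illustrates that in6_pton() could be used
 * the same way in4_pton() is used above.
 */
#if 0
static int nvme_rdma_parse_ip6addr(struct sockaddr_in6 *in6_addr, char *p)
{
	size_t buflen = strlen(p);

	if (buflen > INET6_ADDRSTRLEN)
		return -EINVAL;
	if (in6_pton(p, buflen, in6_addr->sin6_addr.s6_addr, '\0', NULL) == 0)
		return -EINVAL;
	in6_addr->sin6_family = AF_INET6;
	return 0;
}
#endif
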
1865 static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1866                 struct nvmf_ctrl_options *opts)
1867 {
1868         struct nvme_rdma_ctrl *ctrl;
1869         int ret;
1870         bool changed;
1871
1872         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1873         if (!ctrl)
1874                 return ERR_PTR(-ENOMEM);
1875         ctrl->ctrl.opts = opts;
1876         INIT_LIST_HEAD(&ctrl->list);
1877
1878         ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
1879         if (ret) {
1880                 pr_err("malformed IP address passed: %s\n", opts->traddr);
1881                 goto out_free_ctrl;
1882         }
1883
1884         if (opts->mask & NVMF_OPT_TRSVCID) {
1885                 u16 port;
1886
1887                 ret = kstrtou16(opts->trsvcid, 0, &port);
1888                 if (ret)
1889                         goto out_free_ctrl;
1890
1891                 ctrl->addr_in.sin_port = cpu_to_be16(port);
1892         } else {
1893                 ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
1894         }
1895
1896         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
1897                                 0 /* no quirks, we're perfect! */);
1898         if (ret)
1899                 goto out_free_ctrl;
1900
1901         ctrl->reconnect_delay = opts->reconnect_delay;
1902         INIT_DELAYED_WORK(&ctrl->reconnect_work,
1903                         nvme_rdma_reconnect_ctrl_work);
1904         INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
1905         INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
1906         INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work);
1907         spin_lock_init(&ctrl->lock);
1908
1909         ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
1910         ctrl->ctrl.sqsize = opts->queue_size;
1911         ctrl->ctrl.kato = opts->kato;
1912
1913         ret = -ENOMEM;
1914         ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
1915                                 GFP_KERNEL);
1916         if (!ctrl->queues)
1917                 goto out_uninit_ctrl;
1918
1919         ret = nvme_rdma_configure_admin_queue(ctrl);
1920         if (ret)
1921                 goto out_kfree_queues;
1922
1923         /* sanity check icdoff */
1924         if (ctrl->ctrl.icdoff) {
1925                 dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
                     ret = -EINVAL;
1926                 goto out_remove_admin_queue;
1927         }
1928
1929         /* sanity check keyed sgls */
1930         if (!(ctrl->ctrl.sgls & (1 << 20))) {
1931                 dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported\n");
                     ret = -EINVAL;
1932                 goto out_remove_admin_queue;
1933         }
1934
1935         if (opts->queue_size > ctrl->ctrl.maxcmd) {
1936                 /* warn if maxcmd is lower than queue_size */
1937                 dev_warn(ctrl->ctrl.device,
1938                         "queue_size %zu > ctrl maxcmd %u, clamping down\n",
1939                         opts->queue_size, ctrl->ctrl.maxcmd);
1940                 opts->queue_size = ctrl->ctrl.maxcmd;
1941         }
1942
1943         if (opts->nr_io_queues) {
1944                 ret = nvme_rdma_create_io_queues(ctrl);
1945                 if (ret)
1946                         goto out_remove_admin_queue;
1947         }
1948
1949         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1950         WARN_ON_ONCE(!changed);
1951
1952         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
1953                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
1954
1955         kref_get(&ctrl->ctrl.kref);
1956
1957         mutex_lock(&nvme_rdma_ctrl_mutex);
1958         list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
1959         mutex_unlock(&nvme_rdma_ctrl_mutex);
1960
1961         if (opts->nr_io_queues) {
1962                 nvme_queue_scan(&ctrl->ctrl);
1963                 nvme_queue_async_events(&ctrl->ctrl);
1964         }
1965
1966         return &ctrl->ctrl;
1967
1968 out_remove_admin_queue:
1969         nvme_stop_keep_alive(&ctrl->ctrl);
1970         nvme_rdma_destroy_admin_queue(ctrl);
1971 out_kfree_queues:
1972         kfree(ctrl->queues);
1973 out_uninit_ctrl:
1974         nvme_uninit_ctrl(&ctrl->ctrl);
1975         nvme_put_ctrl(&ctrl->ctrl);
1976         if (ret > 0)
1977                 ret = -EIO;
1978         return ERR_PTR(ret);
1979 out_free_ctrl:
1980         kfree(ctrl);
1981         return ERR_PTR(ret);
1982 }
1983
1984 static struct nvmf_transport_ops nvme_rdma_transport = {
1985         .name           = "rdma",
1986         .required_opts  = NVMF_OPT_TRADDR,
1987         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY,
1988         .create_ctrl    = nvme_rdma_create_ctrl,
1989 };
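
/*
 * The transport registers under the name "rdma".  Userspace creates a
 * controller by writing a comma-separated option string to
 * /dev/nvme-fabrics (usually via nvme-cli).  Illustrative example only,
 * with a made-up address and NQN:
 *
 *   echo "transport=rdma,traddr=192.168.1.10,trsvcid=4420,nqn=testnqn" \
 *	> /dev/nvme-fabrics
 */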
1990
1991 static int __init nvme_rdma_init_module(void)
1992 {
1993         nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
1994         if (!nvme_rdma_wq)
1995                 return -ENOMEM;
1996
1997         nvmf_register_transport(&nvme_rdma_transport);
1998         return 0;
1999 }
2000
2001 static void __exit nvme_rdma_cleanup_module(void)
2002 {
2003         struct nvme_rdma_ctrl *ctrl;
2004
2005         nvmf_unregister_transport(&nvme_rdma_transport);
2006
2007         mutex_lock(&nvme_rdma_ctrl_mutex);
2008         list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
2009                 __nvme_rdma_del_ctrl(ctrl);
2010         mutex_unlock(&nvme_rdma_ctrl_mutex);
2011
2012         destroy_workqueue(nvme_rdma_wq);
2013 }
2014
2015 module_init(nvme_rdma_init_module);
2016 module_exit(nvme_rdma_cleanup_module);
2017
2018 MODULE_LICENSE("GPL v2");