drivers/staging/rdma/hfi1/user_sdma.c

   1 /*
   2  *
   3  * This file is provided under a dual BSD/GPLv2 license.  When using or
   4  * redistributing this file, you may do so under either license.
   5  *
   6  * GPL LICENSE SUMMARY
   7  *
   8  * Copyright(c) 2015 Intel Corporation.
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of version 2 of the GNU General Public License as
  12  * published by the Free Software Foundation.
  13  *
  14  * This program is distributed in the hope that it will be useful, but
  15  * WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * General Public License for more details.
  18  *
  19  * BSD LICENSE
  20  *
  21  * Copyright(c) 2015 Intel Corporation.
  22  *
  23  * Redistribution and use in source and binary forms, with or without
  24  * modification, are permitted provided that the following conditions
  25  * are met:
  26  *
  27  *  - Redistributions of source code must retain the above copyright
  28  *    notice, this list of conditions and the following disclaimer.
  29  *  - Redistributions in binary form must reproduce the above copyright
  30  *    notice, this list of conditions and the following disclaimer in
  31  *    the documentation and/or other materials provided with the
  32  *    distribution.
  33  *  - Neither the name of Intel Corporation nor the names of its
  34  *    contributors may be used to endorse or promote products derived
  35  *    from this software without specific prior written permission.
  36  *
  37  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  38  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  39  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  40  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  41  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  42  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  43  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  44  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  45  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  46  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  47  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  48  *
  49  */
  50 #include <linux/mm.h>
  51 #include <linux/types.h>
  52 #include <linux/device.h>
  53 #include <linux/dmapool.h>
  54 #include <linux/slab.h>
  55 #include <linux/list.h>
  56 #include <linux/highmem.h>
  57 #include <linux/io.h>
  58 #include <linux/uio.h>
  59 #include <linux/rbtree.h>
  60 #include <linux/spinlock.h>
  61 #include <linux/delay.h>
  62 #include <linux/kthread.h>
  63 #include <linux/mmu_context.h>
  64 #include <linux/module.h>
  65 #include <linux/vmalloc.h>
  66
  67 #include "hfi.h"
  68 #include "sdma.h"
  69 #include "user_sdma.h"
  70 #include "verbs.h"  /* for the headers */
  71 #include "common.h" /* for struct hfi1_tid_info */
  72 #include "trace.h"
  73
/*
 * Number of entries in the user SDMA completion ring. The same value also
 * sizes the per-queue request array in hfi1_user_sdma_alloc_queues().
 * Exposed as the "sdma_comp_size" module parameter with S_IRUGO
 * permissions; a value of 0 makes queue allocation fail with -EINVAL.
 */
static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  77
/* The maximum number of Data io vectors per message/request */
#define MAX_VECTORS_PER_REQ 8
/*
 * Maximum number of packets to send from each message/request
 * before moving to the next one.
 */
#define MAX_PKTS_PER_QUEUE 16

/*
 * Page count for a length of x bytes: one plus the page index of byte
 * (x - 1).
 * NOTE(review): for x == 0 the (x) - 1 term wraps on unsigned arguments;
 * confirm that callers never pass a zero length.
 */
#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))

/*
 * Field extractors for the request control word (sdma_req_info.ctrl):
 * shift the field down and mask it to its width.
 * NOTE(review): req_version() masks with HFI1_SDMA_REQ_OPCODE_MASK rather
 * than a version-specific mask. This is harmless only if both masks have
 * the same value - verify against the header that defines them.
 */
#define req_opcode(x) \
        (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
#define req_version(x) \
        (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
#define req_iovcnt(x) \
        (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)

/* Number of BTH.PSN bits used for sequence number in expected rcvs */
#define BTH_SEQ_MASK 0x7ffull
  97
/*
 * Define fields in the KDETH header so we can update the header
 * template. Each field is described by a <name>_SHIFT (bit position) and
 * a <name>_MASK (width mask, applied after shifting).
 */
#define KDETH_OFFSET_SHIFT        0
#define KDETH_OFFSET_MASK         0x7fff
#define KDETH_OM_SHIFT            15
#define KDETH_OM_MASK             0x1
#define KDETH_TID_SHIFT           16
#define KDETH_TID_MASK            0x3ff
#define KDETH_TIDCTRL_SHIFT       26
#define KDETH_TIDCTRL_MASK        0x3
#define KDETH_INTR_SHIFT          28
#define KDETH_INTR_MASK           0x1
#define KDETH_SH_SHIFT            29
#define KDETH_SH_MASK             0x1
#define KDETH_HCRC_UPPER_SHIFT    16
#define KDETH_HCRC_UPPER_MASK     0xff
#define KDETH_HCRC_LOWER_SHIFT    24
#define KDETH_HCRC_LOWER_MASK     0xff

/*
 * PBC2LRH: keep the low 12 bits of x, multiply by 4 and subtract 4.
 * LRH2PBC: divide x by 4, add 1 and keep the low 12 bits.
 * NOTE(review): presumably these convert between the PBC length field
 * (dwords) and a byte length used for the LRH - confirm at the call sites.
 */
#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)

/*
 * KDETH_GET: extract KDETH field 'field' from the little-endian dword val.
 * KDETH_SET: replace KDETH field 'field' in the little-endian dword dw
 * with val (masked to the field width). dw is both read and assigned, so
 * it must be an lvalue.
 */
#define KDETH_GET(val, field)                                           \
        (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
#define KDETH_SET(dw, field, val) do {                                  \
                u32 dwval = le32_to_cpu(dw);                            \
                dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
                dwval |= (((val) & KDETH_##field##_MASK) << \
                          KDETH_##field##_SHIFT);                       \
                dw = cpu_to_le32(dwval);                                \
        } while (0)

/*
 * Store the AHG descriptor built by sdma_build_ahg_descriptor() in
 * arr[idx] and post-increment idx (idx must therefore be an lvalue).
 * Hidden control flow: when idx is already past the end of arr, this
 * macro executes "return -ERANGE" in the function that invokes it.
 */
#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)                 \
        do {                                                            \
                if ((idx) < ARRAY_SIZE((arr)))                          \
                        (arr)[(idx++)] = sdma_build_ahg_descriptor(     \
                                (__force u16)(value), (dw), (bit),      \
                                                        (width));       \
                else                                                    \
                        return -ERANGE;                                 \
        } while (0)

/*
 * KDETH OM multipliers and switch over point.
 * KDETH_OM_MAX_SIZE evaluates to 1 << ((64 / 4) + 1), i.e. 1 << 17.
 */
#define KDETH_OM_SMALL     4
#define KDETH_OM_LARGE     64
#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
 146
/*
 * Last packet in the request / last packet using an io vector.
 * Both flags are BIT(0). struct user_sdma_txreq has a flags field of its
 * own and one per iovecs[] entry.
 * NOTE(review): presumably each flag is used on a different one of those
 * fields, so the shared bit value does not collide - confirm at the users.
 */
#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
#define TXREQ_FLAGS_IOVEC_LAST_PKT BIT(0)

/*
 * Bit numbers (not masks) for user_sdma_request.flags; used with
 * set_bit()/test_bit().
 */
#define SDMA_REQ_IN_USE     0
#define SDMA_REQ_FOR_THREAD 1
#define SDMA_REQ_SEND_DONE  2
#define SDMA_REQ_HAVE_AHG   3
#define SDMA_REQ_HAS_ERROR  4
#define SDMA_REQ_DONE_ERROR 5

/* Values stored in hfi1_user_sdma_pkt_q.state */
#define SDMA_PKT_Q_INACTIVE BIT(0)
#define SDMA_PKT_Q_ACTIVE   BIT(1)
#define SDMA_PKT_Q_DEFERRED BIT(2)

/*
 * Maximum retry attempts to submit a TX request
 * before putting the process to sleep.
 */
#define MAX_DEFER_RETRY_COUNT 1

/*
 * Upper bound on the number of packets submitted by the first
 * user_sdma_send_pkts() call in hfi1_user_sdma_process_request().
 */
static unsigned initial_pkt_count = 8;

/*
 * Timeout for each wait on the packet queue's wait_dma queue while a
 * submission is deferred.
 */
#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
 171
/*
 * Driver-side bookkeeping for one user data io vector of a request.
 */
struct user_sdma_iovec {
        /* copy of the io vector handed in by the caller */
        struct iovec iov;
        /* number of pages in this vector */
        unsigned npages;
        /* array of pinned pages for this vector */
        struct page **pages;
        /*
         * offset into the virtual address space of the vector at
         * which we last left off.
         */
        u64 offset;
};
 182
/*
 * State of one user SDMA request. Requests live in the pq->reqs array and
 * are selected by the completion index (info.comp_idx) supplied by user
 * space; hfi1_user_sdma_process_request() zeroes the entry before use.
 */
struct user_sdma_request {
        /* copy of the request info block read from the first io vector */
        struct sdma_req_info info;
        /* packet queue and completion queue this request belongs to */
        struct hfi1_user_sdma_pkt_q *pq;
        struct hfi1_user_sdma_comp_q *cq;
        /* This is the original header from user space */
        struct hfi1_pkt_header hdr;
        /*
         * Pointer to the SDMA engine for this request.
         * Since different requests could be on different VLs,
         * each request will need its own engine pointer.
         */
        struct sdma_engine *sde;
        /*
         * AHG entry index returned by sdma_ahg_alloc(); meaningful only
         * when SDMA_REQ_HAVE_AHG is set in flags.
         */
        u8 ahg_idx;
        /* NOTE(review): presumably the AHG descriptors built via AHG_HEADER_SET */
        u32 ahg[9];
        /*
         * KDETH.Offset (Eager) field
         * We need to remember the initial value so the headers
         * can be updated properly.
         */
        u32 koffset;
        /*
         * KDETH.OFFSET (TID) field
         * The offset can cover multiple packets, depending on the
         * size of the TID entry.
         */
        u32 tidoffset;
        /*
         * KDETH.OM
         * Remember this because the header template always sets it
         * to 0.
         */
        u8 omfactor;
        /*
         * pointer to the user's mm_struct. We are going to
         * get a reference to it so it doesn't get freed
         * since we might not be in process context when we
         * are processing the iov's.
         * Using this mm_struct, we can get vma based on the
         * iov's address (find_vma()).
         */
        struct mm_struct *user_mm;
        /*
         * We copy the iovs for this request (based on
         * info.iovcnt). These are only the data vectors
         */
        unsigned data_iovs;
        /* total length of the data in the request */
        u32 data_len;
        /* progress index moving along the iovs array */
        unsigned iov_idx;
        struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
        /* number of elements copied to the tids array */
        u16 n_tids;
        /* TID array values copied from the tid_iov vector */
        u32 *tids;
        /* index of the tids[] entry currently being consumed */
        u16 tididx;
        /* NOTE(review): presumably the number of data bytes already sent */
        u32 sent;
        /*
         * Packet sequence counter: 0 while the first packet is being
         * built, and compared against info.npkts to detect that the whole
         * request has been processed.
         */
        u64 seqnum;
        /* NOTE(review): presumably txreqs built but not yet submitted */
        struct list_head txps;
        spinlock_t txcmp_lock;  /* protect txcmp list */
        struct list_head txcmp;
        /* SDMA_REQ_* bit flags */
        unsigned long flags;
        /* status of the last txreq completed */
        int status;
        /* work item running user_sdma_delayed_completion() */
        struct work_struct worker;
};
 249
/*
 * A single txreq could span up to 3 physical pages when the MTU
 * is sufficiently large (> 4K). Each of the IOV pointers also
 * needs its own set of flags so that the vectors can be handled
 * independently of each other.
 */
struct user_sdma_txreq {
        /* Packet header for the txreq */
        struct hfi1_pkt_header hdr;
        /* embedded SDMA txreq; container_of() maps it back to this struct */
        struct sdma_txreq txreq;
        struct list_head list;
        /* request this txreq was built for */
        struct user_sdma_request *req;
        /* io vectors referenced by this txreq, each with its own flags */
        struct {
                struct user_sdma_iovec *vec;
                u8 flags;
        } iovecs[3];
        int idx;
        u16 flags;
        /* number of times defer_packet_queue() has answered -EAGAIN */
        unsigned busycount;
        u64 seqnum;
};
 271
/*
 * Debug trace helpers for the SDMA category.
 * SDMA_DBG prefixes the message with [unit:ctxt:subctxt:comp_idx] taken
 * from a request; SDMA_Q_DBG prefixes it with [unit:ctxt:subctxt] taken
 * from a packet queue.
 */
#define SDMA_DBG(req, fmt, ...)                              \
        hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
                 (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
                 ##__VA_ARGS__)
#define SDMA_Q_DBG(pq, fmt, ...)                         \
        hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
                 (pq)->subctxt, ##__VA_ARGS__)
 279
/* Forward declarations of the file-local helpers. */
static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
static int num_user_pages(const struct iovec *);
static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
static void user_sdma_delayed_completion(struct work_struct *);
static void user_sdma_free_request(struct user_sdma_request *);
static int pin_vector_pages(struct user_sdma_request *,
                            struct user_sdma_iovec *);
static void unpin_vector_pages(struct user_sdma_request *,
                               struct user_sdma_iovec *);
static int check_header_template(struct user_sdma_request *,
                                 struct hfi1_pkt_header *, u32, u32);
static int set_txreq_header(struct user_sdma_request *,
                            struct user_sdma_txreq *, u32);
static int set_txreq_header_ahg(struct user_sdma_request *,
                                struct user_sdma_txreq *, u32);
static inline void set_comp_state(struct user_sdma_request *,
                                        enum hfi1_sdma_comp_state, int);
static inline u32 set_pkt_bth_psn(__be32, u8, u32);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

/* Callbacks handed to iowait_init() when a packet queue is created. */
static int defer_packet_queue(
        struct sdma_engine *,
        struct iowait *,
        struct sdma_txreq *,
        unsigned seq);
static void activate_packet_queue(struct iowait *, int);
 306
 307 static int defer_packet_queue(
 308         struct sdma_engine *sde,
 309         struct iowait *wait,
 310         struct sdma_txreq *txreq,
 311         unsigned seq)
 312 {
 313         struct hfi1_user_sdma_pkt_q *pq =
 314                 container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 315         struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
 316         struct user_sdma_txreq *tx =
 317                 container_of(txreq, struct user_sdma_txreq, txreq);
 318
 319         if (sdma_progress(sde, seq, txreq)) {
 320                 if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
 321                         goto eagain;
 322         }
 323         /*
 324          * We are assuming that if the list is enqueued somewhere, it
 325          * is to the dmawait list since that is the only place where
 326          * it is supposed to be enqueued.
 327          */
 328         xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
 329         write_seqlock(&dev->iowait_lock);
 330         if (list_empty(&pq->busy.list))
 331                 list_add_tail(&pq->busy.list, &sde->dmawait);
 332         write_sequnlock(&dev->iowait_lock);
 333         return -EBUSY;
 334 eagain:
 335         return -EAGAIN;
 336 }
 337
 338 static void activate_packet_queue(struct iowait *wait, int reason)
 339 {
 340         struct hfi1_user_sdma_pkt_q *pq =
 341                 container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 342         xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 343         wake_up(&wait->wait_dma);
 344 };
 345
 346 static void sdma_kmem_cache_ctor(void *obj)
 347 {
 348         struct user_sdma_txreq *tx = obj;
 349
 350         memset(tx, 0, sizeof(*tx));
 351 }
 352
 353 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 354 {
 355         struct hfi1_filedata *fd;
 356         int ret = 0;
 357         unsigned memsize;
 358         char buf[64];
 359         struct hfi1_devdata *dd;
 360         struct hfi1_user_sdma_comp_q *cq;
 361         struct hfi1_user_sdma_pkt_q *pq;
 362         unsigned long flags;
 363
 364         if (!uctxt || !fp) {
 365                 ret = -EBADF;
 366                 goto done;
 367         }
 368
 369         fd = fp->private_data;
 370
 371         if (!hfi1_sdma_comp_ring_size) {
 372                 ret = -EINVAL;
 373                 goto done;
 374         }
 375
 376         dd = uctxt->dd;
 377
 378         pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 379         if (!pq)
 380                 goto pq_nomem;
 381
 382         memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
 383         pq->reqs = kmalloc(memsize, GFP_KERNEL);
 384         if (!pq->reqs)
 385                 goto pq_reqs_nomem;
 386
 387         INIT_LIST_HEAD(&pq->list);
 388         pq->dd = dd;
 389         pq->ctxt = uctxt->ctxt;
 390         pq->subctxt = fd->subctxt;
 391         pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 392         pq->state = SDMA_PKT_Q_INACTIVE;
 393         atomic_set(&pq->n_reqs, 0);
 394         init_waitqueue_head(&pq->wait);
 395
 396         iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
 397                     activate_packet_queue);
 398         pq->reqidx = 0;
 399         snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 400                  fd->subctxt);
 401         pq->txreq_cache = kmem_cache_create(buf,
 402                                sizeof(struct user_sdma_txreq),
 403                                             L1_CACHE_BYTES,
 404                                             SLAB_HWCACHE_ALIGN,
 405                                             sdma_kmem_cache_ctor);
 406         if (!pq->txreq_cache) {
 407                 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 408                            uctxt->ctxt);
 409                 goto pq_txreq_nomem;
 410         }
 411         fd->pq = pq;
 412         cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 413         if (!cq)
 414                 goto cq_nomem;
 415
 416         memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
 417         cq->comps = vmalloc_user(memsize);
 418         if (!cq->comps)
 419                 goto cq_comps_nomem;
 420
 421         cq->nentries = hfi1_sdma_comp_ring_size;
 422         fd->cq = cq;
 423
 424         spin_lock_irqsave(&uctxt->sdma_qlock, flags);
 425         list_add(&pq->list, &uctxt->sdma_queues);
 426         spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
 427         goto done;
 428
 429 cq_comps_nomem:
 430         kfree(cq);
 431 cq_nomem:
 432         kmem_cache_destroy(pq->txreq_cache);
 433 pq_txreq_nomem:
 434         kfree(pq->reqs);
 435 pq_reqs_nomem:
 436         kfree(pq);
 437         fd->pq = NULL;
 438 pq_nomem:
 439         ret = -ENOMEM;
 440 done:
 441         return ret;
 442 }
 443
 444 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
 445 {
 446         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 447         struct hfi1_user_sdma_pkt_q *pq;
 448         unsigned long flags;
 449
 450         hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
 451                   uctxt->ctxt, fd->subctxt);
 452         pq = fd->pq;
 453         if (pq) {
 454                 spin_lock_irqsave(&uctxt->sdma_qlock, flags);
 455                 if (!list_empty(&pq->list))
 456                         list_del_init(&pq->list);
 457                 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
 458                 iowait_sdma_drain(&pq->busy);
 459                 /* Wait until all requests have been freed. */
 460                 wait_event_interruptible(
 461                         pq->wait,
 462                         (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
 463                 kfree(pq->reqs);
 464                 kmem_cache_destroy(pq->txreq_cache);
 465                 kfree(pq);
 466                 fd->pq = NULL;
 467         }
 468         if (fd->cq) {
 469                 vfree(fd->cq->comps);
 470                 kfree(fd->cq);
 471                 fd->cq = NULL;
 472         }
 473         return 0;
 474 }
 475
 476 int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 477                                    unsigned long dim, unsigned long *count)
 478 {
 479         int ret = 0, i = 0, sent;
 480         struct hfi1_filedata *fd = fp->private_data;
 481         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 482         struct hfi1_user_sdma_pkt_q *pq = fd->pq;
 483         struct hfi1_user_sdma_comp_q *cq = fd->cq;
 484         struct hfi1_devdata *dd = pq->dd;
 485         unsigned long idx = 0;
 486         u8 pcount = initial_pkt_count;
 487         struct sdma_req_info info;
 488         struct user_sdma_request *req;
 489         u8 opcode, sc, vl;
 490
 491         if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 492                 hfi1_cdbg(
 493                    SDMA,
 494                    "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 495                    dd->unit, uctxt->ctxt, fd->subctxt,
 496                    iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 497                 return -EINVAL;
 498         }
 499         ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 500         if (ret) {
 501                 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 502                           dd->unit, uctxt->ctxt, fd->subctxt, ret);
 503                 return -EFAULT;
 504         }
 505         trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 506                                      (u16 *)&info);
 507         if (cq->comps[info.comp_idx].status == QUEUED) {
 508                 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
 509                           dd->unit, uctxt->ctxt, fd->subctxt,
 510                           info.comp_idx);
 511                 return -EBADSLT;
 512         }
 513         if (!info.fragsize) {
 514                 hfi1_cdbg(SDMA,
 515                           "[%u:%u:%u:%u] Request does not specify fragsize",
 516                           dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 517                 return -EINVAL;
 518         }
 519         /*
 520          * We've done all the safety checks that we can up to this point,
 521          * "allocate" the request entry.
 522          */
 523         hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
 524                   uctxt->ctxt, fd->subctxt, info.comp_idx);
 525         req = pq->reqs + info.comp_idx;
 526         memset(req, 0, sizeof(*req));
 527         /* Mark the request as IN_USE before we start filling it in. */
 528         set_bit(SDMA_REQ_IN_USE, &req->flags);
 529         req->data_iovs = req_iovcnt(info.ctrl) - 1;
 530         req->pq = pq;
 531         req->cq = cq;
 532         req->status = -1;
 533         INIT_LIST_HEAD(&req->txps);
 534         INIT_LIST_HEAD(&req->txcmp);
 535         INIT_WORK(&req->worker, user_sdma_delayed_completion);
 536
 537         spin_lock_init(&req->txcmp_lock);
 538         memcpy(&req->info, &info, sizeof(info));
 539
 540         if (req_opcode(info.ctrl) == EXPECTED)
 541                 req->data_iovs--;
 542
 543         if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 544                 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 545                          MAX_VECTORS_PER_REQ);
 546                 return -EINVAL;
 547         }
 548         /* Copy the header from the user buffer */
 549         ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 550                              sizeof(req->hdr));
 551         if (ret) {
 552                 SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 553                 ret = -EFAULT;
 554                 goto free_req;
 555         }
 556
 557         /* If Static rate control is not enabled, sanitize the header. */
 558         if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 559                 req->hdr.pbc[2] = 0;
 560
 561         /* Validate the opcode. Do not trust packets from user space blindly. */
 562         opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 563         if ((opcode & USER_OPCODE_CHECK_MASK) !=
 564              USER_OPCODE_CHECK_VAL) {
 565                 SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 566                 ret = -EINVAL;
 567                 goto free_req;
 568         }
 569         /*
 570          * Validate the vl. Do not trust packets from user space blindly.
 571          * VL comes from PBC, SC comes from LRH, and the VL needs to
 572          * match the SC look up.
 573          */
 574         vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 575         sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 576               (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 577         if (vl >= dd->pport->vls_operational ||
 578             vl != sc_to_vlt(dd, sc)) {
 579                 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 580                 ret = -EINVAL;
 581                 goto free_req;
 582         }
 583
 584         /*
 585          * Also should check the BTH.lnh. If it says the next header is GRH then
 586          * the RXE parsing will be off and will land in the middle of the KDETH
 587          * or miss it entirely.
 588          */
 589         if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 590                 SDMA_DBG(req, "User tried to pass in a GRH");
 591                 ret = -EINVAL;
 592                 goto free_req;
 593         }
 594
 595         req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 596         /* Calculate the initial TID offset based on the values of
 597            KDETH.OFFSET and KDETH.OM that are passed in. */
 598         req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 599                 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 600                  KDETH_OM_LARGE : KDETH_OM_SMALL);
 601         SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
 602         idx++;
 603
 604         /* Save all the IO vector structures */
 605         while (i < req->data_iovs) {
 606                 memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
 607                 req->iovs[i].offset = 0;
 608                 req->data_len += req->iovs[i++].iov.iov_len;
 609         }
 610         SDMA_DBG(req, "total data length %u", req->data_len);
 611
 612         if (pcount > req->info.npkts)
 613                 pcount = req->info.npkts;
 614         /*
 615          * Copy any TID info
 616          * User space will provide the TID info only when the
 617          * request type is EXPECTED. This is true even if there is
 618          * only one packet in the request and the header is already
 619          * setup. The reason for the singular TID case is that the
 620          * driver needs to perform safety checks.
 621          */
 622         if (req_opcode(req->info.ctrl) == EXPECTED) {
 623                 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 624
 625                 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 626                         ret = -EINVAL;
 627                         goto free_req;
 628                 }
 629                 req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
 630                 if (!req->tids) {
 631                         ret = -ENOMEM;
 632                         goto free_req;
 633                 }
 634                 /*
 635                  * We have to copy all of the tids because they may vary
 636                  * in size and, therefore, the TID count might not be
 637                  * equal to the pkt count. However, there is no way to
 638                  * tell at this point.
 639                  */
 640                 ret = copy_from_user(req->tids, iovec[idx].iov_base,
 641                                      ntids * sizeof(*req->tids));
 642                 if (ret) {
 643                         SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 644                                  ntids, ret);
 645                         ret = -EFAULT;
 646                         goto free_req;
 647                 }
 648                 req->n_tids = ntids;
 649                 idx++;
 650         }
 651
 652         /* Have to select the engine */
 653         req->sde = sdma_select_engine_vl(dd,
 654                                          (u32)(uctxt->ctxt + fd->subctxt),
 655                                          vl);
 656         if (!req->sde || !sdma_running(req->sde)) {
 657                 ret = -ECOMM;
 658                 goto free_req;
 659         }
 660
 661         /* We don't need an AHG entry if the request contains only one packet */
 662         if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
 663                 int ahg = sdma_ahg_alloc(req->sde);
 664
 665                 if (likely(ahg >= 0)) {
 666                         req->ahg_idx = (u8)ahg;
 667                         set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
 668                 }
 669         }
 670
 671         set_comp_state(req, QUEUED, 0);
 672         /* Send the first N packets in the request to buy us some time */
 673         sent = user_sdma_send_pkts(req, pcount);
 674         if (unlikely(sent < 0)) {
 675                 if (sent != -EBUSY) {
 676                         req->status = sent;
 677                         set_comp_state(req, ERROR, req->status);
 678                         return sent;
 679                 } else
 680                         sent = 0;
 681         }
 682         atomic_inc(&pq->n_reqs);
 683         xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 684
 685         if (sent < req->info.npkts) {
 686                 /*
 687                  * This is a somewhat blocking send implementation.
 688                  * The driver will block the caller until all packets of the
 689                  * request have been submitted to the SDMA engine. However, it
 690                  * will not wait for send completions.
 691                  */
 692                 while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
 693                         ret = user_sdma_send_pkts(req, pcount);
 694                         if (ret < 0) {
 695                                 if (ret != -EBUSY) {
 696                                         req->status = ret;
 697                                         return ret;
 698                                 }
 699                                 wait_event_interruptible_timeout(
 700                                         pq->busy.wait_dma,
 701                                         (pq->state == SDMA_PKT_Q_ACTIVE),
 702                                         msecs_to_jiffies(
 703                                                 SDMA_IOWAIT_TIMEOUT));
 704                         }
 705                 }
 706
 707         }
 708         *count += idx;
 709         return 0;
 710 free_req:
 711         user_sdma_free_request(req);
 712         return ret;
 713 }
 714
 715 static inline u32 compute_data_length(struct user_sdma_request *req,
 716                                             struct user_sdma_txreq *tx)
 717 {
 718         /*
 719          * Determine the proper size of the packet data.
 720          * The size of the data of the first packet is in the header
 721          * template. However, it includes the header and ICRC, which need
 722          * to be subtracted.
 723          * The size of the remaining packets is the minimum of the frag
 724          * size (MTU) or remaining data in the request.
 725          */
 726         u32 len;
 727
 728         if (!req->seqnum) {
 729                 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 730                        (sizeof(tx->hdr) - 4));
 731         } else if (req_opcode(req->info.ctrl) == EXPECTED) {
 732                 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 733                         PAGE_SIZE;
 734                 /* Get the data length based on the remaining space in the
 735                  * TID pair. */
 736                 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 737                 /* If we've filled up the TID pair, move to the next one. */
 738                 if (unlikely(!len) && ++req->tididx < req->n_tids &&
 739                     req->tids[req->tididx]) {
 740                         tidlen = EXP_TID_GET(req->tids[req->tididx],
 741                                              LEN) * PAGE_SIZE;
 742                         req->tidoffset = 0;
 743                         len = min_t(u32, tidlen, req->info.fragsize);
 744                 }
 745                 /* Since the TID pairs map entire pages, make sure that we
 746                  * are not going to try to send more data that we have
 747                  * remaining. */
 748                 len = min(len, req->data_len - req->sent);
 749         } else
 750                 len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 751         SDMA_DBG(req, "Data Length = %u", len);
 752         return len;
 753 }
 754
 755 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 756 {
 757         /* (Size of complete header - size of PBC) + 4B ICRC + data length */
 758         return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 759 }
 760
 761 static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 762 {
 763         int ret = 0;
 764         unsigned npkts = 0;
 765         struct user_sdma_txreq *tx = NULL;
 766         struct hfi1_user_sdma_pkt_q *pq = NULL;
 767         struct user_sdma_iovec *iovec = NULL;
 768
 769         if (!req->pq)
 770                 return -EINVAL;
 771
 772         pq = req->pq;
 773
 774         /* If tx completion has reported an error, we are done. */
 775         if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
 776                 set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
 777                 return -EFAULT;
 778         }
 779
 780         /*
 781          * Check if we might have sent the entire request already
 782          */
 783         if (unlikely(req->seqnum == req->info.npkts)) {
 784                 if (!list_empty(&req->txps))
 785                         goto dosend;
 786                 return ret;
 787         }
 788
 789         if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 790                 maxpkts = req->info.npkts - req->seqnum;
 791
 792         while (npkts < maxpkts) {
 793                 u32 datalen = 0, queued = 0, data_sent = 0;
 794                 u64 iov_offset = 0;
 795
 796                 /*
 797                  * Check whether any of the completions have come back
 798                  * with errors. If so, we are not going to process any
 799                  * more packets from this request.
 800                  */
 801                 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
 802                         set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
 803                         return -EFAULT;
 804                 }
 805
 806                 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 807                 if (!tx)
 808                         return -ENOMEM;
 809
 810                 tx->flags = 0;
 811                 tx->req = req;
 812                 tx->busycount = 0;
 813                 tx->idx = -1;
 814                 INIT_LIST_HEAD(&tx->list);
 815                 memset(tx->iovecs, 0, sizeof(tx->iovecs));
 816
 817                 if (req->seqnum == req->info.npkts - 1)
 818                         tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
 819
 820                 /*
 821                  * Calculate the payload size - this is min of the fragment
 822                  * (MTU) size or the remaining bytes in the request but only
 823                  * if we have payload data.
 824                  */
 825                 if (req->data_len) {
 826                         iovec = &req->iovs[req->iov_idx];
 827                         if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
 828                                 if (++req->iov_idx == req->data_iovs) {
 829                                         ret = -EFAULT;
 830                                         goto free_txreq;
 831                                 }
 832                                 iovec = &req->iovs[req->iov_idx];
 833                                 WARN_ON(iovec->offset);
 834                         }
 835
 836                         /*
 837                          * This request might include only a header and no user
 838                          * data, so pin pages only if there is data and it the
 839                          * pages have not been pinned already.
 840                          */
 841                         if (unlikely(!iovec->pages && iovec->iov.iov_len)) {
 842                                 ret = pin_vector_pages(req, iovec);
 843                                 if (ret)
 844                                         goto free_tx;
 845                         }
 846
 847                         tx->iovecs[++tx->idx].vec = iovec;
 848                         datalen = compute_data_length(req, tx);
 849                         if (!datalen) {
 850                                 SDMA_DBG(req,
 851                                          "Request has data but pkt len is 0");
 852                                 ret = -EFAULT;
 853                                 goto free_tx;
 854                         }
 855                 }
 856
 857                 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
 858                         if (!req->seqnum) {
 859                                 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 860                                 u32 lrhlen = get_lrh_len(req->hdr, datalen);
 861                                 /*
 862                                  * Copy the request header into the tx header
 863                                  * because the HW needs a cacheline-aligned
 864                                  * address.
 865                                  * This copy can be optimized out if the hdr
 866                                  * member of user_sdma_request were also
 867                                  * cacheline aligned.
 868                                  */
 869                                 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 870                                 if (PBC2LRH(pbclen) != lrhlen) {
 871                                         pbclen = (pbclen & 0xf000) |
 872                                                 LRH2PBC(lrhlen);
 873                                         tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 874                                 }
 875                                 ret = sdma_txinit_ahg(&tx->txreq,
 876                                                       SDMA_TXREQ_F_AHG_COPY,
 877                                                       sizeof(tx->hdr) + datalen,
 878                                                       req->ahg_idx, 0, NULL, 0,
 879                                                       user_sdma_txreq_cb);
 880                                 if (ret)
 881                                         goto free_tx;
 882                                 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
 883                                                         &tx->hdr,
 884                                                         sizeof(tx->hdr));
 885                                 if (ret)
 886                                         goto free_txreq;
 887                         } else {
 888                                 int changes;
 889
 890                                 changes = set_txreq_header_ahg(req, tx,
 891                                                                datalen);
 892                                 if (changes < 0)
 893                                         goto free_tx;
 894                                 sdma_txinit_ahg(&tx->txreq,
 895                                                 SDMA_TXREQ_F_USE_AHG,
 896                                                 datalen, req->ahg_idx, changes,
 897                                                 req->ahg, sizeof(req->hdr),
 898                                                 user_sdma_txreq_cb);
 899                         }
 900                 } else {
 901                         ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 902                                           datalen, user_sdma_txreq_cb);
 903                         if (ret)
 904                                 goto free_tx;
 905                         /*
 906                          * Modify the header for this packet. This only needs
 907                          * to be done if we are not going to use AHG. Otherwise,
 908                          * the HW will do it based on the changes we gave it
 909                          * during sdma_txinit_ahg().
 910                          */
 911                         ret = set_txreq_header(req, tx, datalen);
 912                         if (ret)
 913                                 goto free_txreq;
 914                 }
 915
 916                 /*
 917                  * If the request contains any data vectors, add up to
 918                  * fragsize bytes to the descriptor.
 919                  */
 920                 while (queued < datalen &&
 921                        (req->sent + data_sent) < req->data_len) {
 922                         unsigned long base, offset;
 923                         unsigned pageidx, len;
 924
 925                         base = (unsigned long)iovec->iov.iov_base;
 926                         offset = offset_in_page(base + iovec->offset +
 927                                                 iov_offset);
 928                         pageidx = (((iovec->offset + iov_offset +
 929                                      base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
 930                         len = offset + req->info.fragsize > PAGE_SIZE ?
 931                                 PAGE_SIZE - offset : req->info.fragsize;
 932                         len = min((datalen - queued), len);
 933                         ret = sdma_txadd_page(pq->dd, &tx->txreq,
 934                                               iovec->pages[pageidx],
 935                                               offset, len);
 936                         if (ret) {
 937                                 int i;
 938
 939                                 SDMA_DBG(req, "SDMA txreq add page failed %d\n",
 940                                          ret);
 941                                 /* Mark all assigned vectors as complete so they
 942                                  * are unpinned in the callback. */
 943                                 for (i = tx->idx; i >= 0; i--) {
 944                                         tx->iovecs[i].flags |=
 945                                                 TXREQ_FLAGS_IOVEC_LAST_PKT;
 946                                 }
 947                                 goto free_txreq;
 948                         }
 949                         iov_offset += len;
 950                         queued += len;
 951                         data_sent += len;
 952                         if (unlikely(queued < datalen &&
 953                                      pageidx == iovec->npages &&
 954                                      req->iov_idx < req->data_iovs - 1 &&
 955                                      tx->idx < ARRAY_SIZE(tx->iovecs))) {
 956                                 iovec->offset += iov_offset;
 957                                 tx->iovecs[tx->idx].flags |=
 958                                         TXREQ_FLAGS_IOVEC_LAST_PKT;
 959                                 iovec = &req->iovs[++req->iov_idx];
 960                                 if (!iovec->pages) {
 961                                         ret = pin_vector_pages(req, iovec);
 962                                         if (ret)
 963                                                 goto free_txreq;
 964                                 }
 965                                 iov_offset = 0;
 966                                 tx->iovecs[++tx->idx].vec = iovec;
 967                         }
 968                 }
 969                 /*
 970                  * The txreq was submitted successfully so we can update
 971                  * the counters.
 972                  */
 973                 req->koffset += datalen;
 974                 if (req_opcode(req->info.ctrl) == EXPECTED)
 975                         req->tidoffset += datalen;
 976                 req->sent += data_sent;
 977                 if (req->data_len) {
 978                         tx->iovecs[tx->idx].vec->offset += iov_offset;
 979                         /* If we've reached the end of the io vector, mark it
 980                          * so the callback can unpin the pages and free it. */
 981                         if (tx->iovecs[tx->idx].vec->offset ==
 982                             tx->iovecs[tx->idx].vec->iov.iov_len)
 983                                 tx->iovecs[tx->idx].flags |=
 984                                         TXREQ_FLAGS_IOVEC_LAST_PKT;
 985                 }
 986
 987                 /*
 988                  * It is important to increment this here as it is used to
 989                  * generate the BTH.PSN and, therefore, can't be bulk-updated
 990                  * outside of the loop.
 991                  */
 992                 tx->seqnum = req->seqnum++;
 993                 list_add_tail(&tx->txreq.list, &req->txps);
 994                 npkts++;
 995         }
 996 dosend:
 997         ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
 998         if (list_empty(&req->txps))
 999                 if (req->seqnum == req->info.npkts) {
1000                         set_bit(SDMA_REQ_SEND_DONE, &req->flags);
1001                         /*
1002                          * The txreq has already been submitted to the HW queue
1003                          * so we can free the AHG entry now. Corruption will not
1004                          * happen due to the sequential manner in which
1005                          * descriptors are processed.
1006                          */
1007                         if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
1008                                 sdma_ahg_free(req->sde, req->ahg_idx);
1009                 }
1010         return ret;
1011
1012 free_txreq:
1013         sdma_txclean(pq->dd, &tx->txreq);
1014 free_tx:
1015         kmem_cache_free(pq->txreq_cache, tx);
1016         return ret;
1017 }
1018
1019 /*
1020  * How many pages in this iovec element?
1021  */
1022 static inline int num_user_pages(const struct iovec *iov)
1023 {
1024         const unsigned long addr  = (unsigned long) iov->iov_base;
1025         const unsigned long len   = iov->iov_len;
1026         const unsigned long spage = addr & PAGE_MASK;
1027         const unsigned long epage = (addr + len - 1) & PAGE_MASK;
1028
1029         return 1 + ((epage - spage) >> PAGE_SHIFT);
1030 }
1031
1032 static int pin_vector_pages(struct user_sdma_request *req,
1033                             struct user_sdma_iovec *iovec) {
1034         int pinned, npages;
1035
1036         npages = num_user_pages(&iovec->iov);
1037         iovec->pages = kcalloc(npages, sizeof(*iovec->pages), GFP_KERNEL);
1038         if (!iovec->pages) {
1039                 SDMA_DBG(req, "Failed page array alloc");
1040                 return -ENOMEM;
1041         }
1042
1043         /*
1044          * Get a reference to the process's mm so we can use it when
1045          * unpinning the io vectors.
1046          */
1047         req->pq->user_mm = get_task_mm(current);
1048
1049         pinned = hfi1_acquire_user_pages((unsigned long)iovec->iov.iov_base,
1050                                          npages, 0, iovec->pages);
1051
1052         if (pinned < 0)
1053                 return pinned;
1054
1055         iovec->npages = pinned;
1056         if (pinned != npages) {
1057                 SDMA_DBG(req, "Failed to pin pages (%d/%u)", pinned, npages);
1058                 unpin_vector_pages(req, iovec);
1059                 return -EFAULT;
1060         }
1061         return 0;
1062 }
1063
1064 static void unpin_vector_pages(struct user_sdma_request *req,
1065                                struct user_sdma_iovec *iovec)
1066 {
1067         /*
1068          * Unpinning is done through the workqueue so use the
1069          * process's mm if we have a reference to it.
1070          */
1071         if ((current->flags & PF_KTHREAD) && req->pq->user_mm)
1072                 use_mm(req->pq->user_mm);
1073
1074         hfi1_release_user_pages(iovec->pages, iovec->npages, 0);
1075
1076         /*
1077          * Unuse the user's mm (see above) and release the
1078          * reference to it.
1079          */
1080         if (req->pq->user_mm) {
1081                 if (current->flags & PF_KTHREAD)
1082                         unuse_mm(req->pq->user_mm);
1083                 mmput(req->pq->user_mm);
1084         }
1085
1086         kfree(iovec->pages);
1087         iovec->pages = NULL;
1088         iovec->npages = 0;
1089         iovec->offset = 0;
1090 }
1091
1092 static int check_header_template(struct user_sdma_request *req,
1093                                  struct hfi1_pkt_header *hdr, u32 lrhlen,
1094                                  u32 datalen)
1095 {
1096         /*
1097          * Perform safety checks for any type of packet:
1098          *    - transfer size is multiple of 64bytes
1099          *    - packet length is multiple of 4bytes
1100          *    - entire request length is multiple of 4bytes
1101          *    - packet length is not larger than MTU size
1102          *
1103          * These checks are only done for the first packet of the
1104          * transfer since the header is "given" to us by user space.
1105          * For the remainder of the packets we compute the values.
1106          */
1107         if (req->info.fragsize % PIO_BLOCK_SIZE ||
1108             lrhlen & 0x3 || req->data_len & 0x3  ||
1109             lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1110                 return -EINVAL;
1111
1112         if (req_opcode(req->info.ctrl) == EXPECTED) {
1113                 /*
1114                  * The header is checked only on the first packet. Furthermore,
1115                  * we ensure that at least one TID entry is copied when the
1116                  * request is submitted. Therefore, we don't have to verify that
1117                  * tididx points to something sane.
1118                  */
1119                 u32 tidval = req->tids[req->tididx],
1120                         tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1121                         tididx = EXP_TID_GET(tidval, IDX),
1122                         tidctrl = EXP_TID_GET(tidval, CTRL),
1123                         tidoff;
1124                 __le32 kval = hdr->kdeth.ver_tid_offset;
1125
1126                 tidoff = KDETH_GET(kval, OFFSET) *
1127                           (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1128                            KDETH_OM_LARGE : KDETH_OM_SMALL);
1129                 /*
1130                  * Expected receive packets have the following
1131                  * additional checks:
1132                  *     - offset is not larger than the TID size
1133                  *     - TIDCtrl values match between header and TID array
1134                  *     - TID indexes match between header and TID array
1135                  */
1136                 if ((tidoff + datalen > tidlen) ||
1137                     KDETH_GET(kval, TIDCTRL) != tidctrl ||
1138                     KDETH_GET(kval, TID) != tididx)
1139                         return -EINVAL;
1140         }
1141         return 0;
1142 }
1143
1144 /*
1145  * Correctly set the BTH.PSN field based on type of
1146  * transfer - eager packets can just increment the PSN but
1147  * expected packets encode generation and sequence in the
1148  * BTH.PSN field so just incrementing will result in errors.
1149  */
1150 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1151 {
1152         u32 val = be32_to_cpu(bthpsn),
1153                 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1154                         0xffffffull),
1155                 psn = val & mask;
1156         if (expct)
1157                 psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
1158         else
1159                 psn = psn + frags;
1160         return psn & mask;
1161 }
1162
1163 static int set_txreq_header(struct user_sdma_request *req,
1164                             struct user_sdma_txreq *tx, u32 datalen)
1165 {
1166         struct hfi1_user_sdma_pkt_q *pq = req->pq;
1167         struct hfi1_pkt_header *hdr = &tx->hdr;
1168         u16 pbclen;
1169         int ret;
1170         u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
1171
1172         /* Copy the header template to the request before modification */
1173         memcpy(hdr, &req->hdr, sizeof(*hdr));
1174
1175         /*
1176          * Check if the PBC and LRH length are mismatched. If so
1177          * adjust both in the header.
1178          */
1179         pbclen = le16_to_cpu(hdr->pbc[0]);
1180         if (PBC2LRH(pbclen) != lrhlen) {
1181                 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1182                 hdr->pbc[0] = cpu_to_le16(pbclen);
1183                 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1184                 /*
1185                  * Third packet
1186                  * This is the first packet in the sequence that has
1187                  * a "static" size that can be used for the rest of
1188                  * the packets (besides the last one).
1189                  */
1190                 if (unlikely(req->seqnum == 2)) {
1191                         /*
1192                          * From this point on the lengths in both the
1193                          * PBC and LRH are the same until the last
1194                          * packet.
1195                          * Adjust the template so we don't have to update
1196                          * every packet
1197                          */
1198                         req->hdr.pbc[0] = hdr->pbc[0];
1199                         req->hdr.lrh[2] = hdr->lrh[2];
1200                 }
1201         }
1202         /*
1203          * We only have to modify the header if this is not the
1204          * first packet in the request. Otherwise, we use the
1205          * header given to us.
1206          */
1207         if (unlikely(!req->seqnum)) {
1208                 ret = check_header_template(req, hdr, lrhlen, datalen);
1209                 if (ret)
1210                         return ret;
1211                 goto done;
1212
1213         }
1214
1215         hdr->bth[2] = cpu_to_be32(
1216                 set_pkt_bth_psn(hdr->bth[2],
1217                                 (req_opcode(req->info.ctrl) == EXPECTED),
1218                                 req->seqnum));
1219
1220         /* Set ACK request on last packet */
1221         if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
1222                 hdr->bth[2] |= cpu_to_be32(1UL<<31);
1223
1224         /* Set the new offset */
1225         hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1226         /* Expected packets have to fill in the new TID information */
1227         if (req_opcode(req->info.ctrl) == EXPECTED) {
1228                 tidval = req->tids[req->tididx];
1229                 /*
1230                  * If the offset puts us at the end of the current TID,
1231                  * advance everything.
1232                  */
1233                 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1234                                          PAGE_SIZE)) {
1235                         req->tidoffset = 0;
1236                         /* Since we don't copy all the TIDs, all at once,
1237                          * we have to check again. */
1238                         if (++req->tididx > req->n_tids - 1 ||
1239                             !req->tids[req->tididx]) {
1240                                 return -EINVAL;
1241                         }
1242                         tidval = req->tids[req->tididx];
1243                 }
1244                 req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1245                         KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
1246                 /* Set KDETH.TIDCtrl based on value for this TID. */
1247                 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1248                           EXP_TID_GET(tidval, CTRL));
1249                 /* Set KDETH.TID based on value for this TID */
1250                 KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1251                           EXP_TID_GET(tidval, IDX));
1252                 /* Clear KDETH.SH only on the last packet */
1253                 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
1254                         KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1255                 /*
1256                  * Set the KDETH.OFFSET and KDETH.OM based on size of
1257                  * transfer.
1258                  */
1259                 SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
1260                          req->tidoffset, req->tidoffset / req->omfactor,
1261                          !!(req->omfactor - KDETH_OM_SMALL));
1262                 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1263                           req->tidoffset / req->omfactor);
1264                 KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1265                           !!(req->omfactor - KDETH_OM_SMALL));
1266         }
1267 done:
1268         trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1269                                     req->info.comp_idx, hdr, tidval);
1270         return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1271 }
1272
1273 static int set_txreq_header_ahg(struct user_sdma_request *req,
1274                                 struct user_sdma_txreq *tx, u32 len)
1275 {
1276         int diff = 0;
1277         struct hfi1_user_sdma_pkt_q *pq = req->pq;
1278         struct hfi1_pkt_header *hdr = &req->hdr;
1279         u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1280         u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
1281
1282         if (PBC2LRH(pbclen) != lrhlen) {
1283                 /* PBC.PbcLengthDWs */
1284                 AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
1285                                cpu_to_le16(LRH2PBC(lrhlen)));
1286                 /* LRH.PktLen (we need the full 16 bits due to byte swap) */
1287                 AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
1288                                cpu_to_be16(lrhlen >> 2));
1289         }
1290
1291         /*
1292          * Do the common updates
1293          */
1294         /* BTH.PSN and BTH.A */
1295         val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1296                 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1297         if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
1298                 val32 |= 1UL << 31;
1299         AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
1300         AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
1301         /* KDETH.Offset */
1302         AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
1303                        cpu_to_le16(req->koffset & 0xffff));
1304         AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
1305                        cpu_to_le16(req->koffset >> 16));
1306         if (req_opcode(req->info.ctrl) == EXPECTED) {
1307                 __le16 val;
1308
1309                 tidval = req->tids[req->tididx];
1310
1311                 /*
1312                  * If the offset puts us at the end of the current TID,
1313                  * advance everything.
1314                  */
1315                 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1316                                          PAGE_SIZE)) {
1317                         req->tidoffset = 0;
1318                         /* Since we don't copy all the TIDs, all at once,
1319                          * we have to check again. */
1320                         if (++req->tididx > req->n_tids - 1 ||
1321                             !req->tids[req->tididx]) {
1322                                 return -EINVAL;
1323                         }
1324                         tidval = req->tids[req->tididx];
1325                 }
1326                 req->omfactor = ((EXP_TID_GET(tidval, LEN) *
1327                                   PAGE_SIZE) >=
1328                                  KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
1329                         KDETH_OM_SMALL;
1330                 /* KDETH.OM and KDETH.OFFSET (TID) */
1331                 AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
1332                                ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
1333                                 ((req->tidoffset / req->omfactor) & 0x7fff)));
1334                 /* KDETH.TIDCtrl, KDETH.TID */
1335                 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1336                                         (EXP_TID_GET(tidval, IDX) & 0x3ff));
1337                 /* Clear KDETH.SH on last packet */
1338                 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
1339                         val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
1340                                                                 INTR) >> 16);
1341                         val &= cpu_to_le16(~(1U << 13));
1342                         AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
1343                 } else
1344                         AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
1345         }
1346
1347         trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1348                                         req->info.comp_idx, req->sde->this_idx,
1349                                         req->ahg_idx, req->ahg, diff, tidval);
1350         return diff;
1351 }
1352
1353 /*
1354  * SDMA tx request completion callback. Called when the SDMA progress
1355  * state machine gets notification that the SDMA descriptors for this
1356  * tx request have been processed by the DMA engine. Called in
1357  * interrupt context.
1358  */
1359 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
1360                                int drain)
1361 {
1362         struct user_sdma_txreq *tx =
1363                 container_of(txreq, struct user_sdma_txreq, txreq);
1364         struct user_sdma_request *req;
1365         bool defer;
1366         int i;
1367
1368         if (!tx->req)
1369                 return;
1370
1371         req = tx->req;
1372         /*
1373          * If this is the callback for the last packet of the request,
1374          * queue up the request for clean up.
1375          */
1376         defer = (tx->seqnum == req->info.npkts - 1);
1377
1378         /*
1379          * If we have any io vectors associated with this txreq,
1380          * check whether they need to be 'freed'. We can't free them
1381          * here because the unpin function needs to be able to sleep.
1382          */
1383         for (i = tx->idx; i >= 0; i--) {
1384                 if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT) {
1385                         defer = true;
1386                         break;
1387                 }
1388         }
1389
1390         req->status = status;
1391         if (status != SDMA_TXREQ_S_OK) {
1392                 SDMA_DBG(req, "SDMA completion with error %d",
1393                          status);
1394                 set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
1395                 defer = true;
1396         }
1397
1398         /*
1399          * Defer the clean up of the iovectors and the request until later
1400          * so it can be done outside of interrupt context.
1401          */
1402         if (defer) {
1403                 spin_lock(&req->txcmp_lock);
1404                 list_add_tail(&tx->list, &req->txcmp);
1405                 spin_unlock(&req->txcmp_lock);
1406                 schedule_work(&req->worker);
1407         } else {
1408                 kmem_cache_free(req->pq->txreq_cache, tx);
1409         }
1410 }
1411
1412 static void user_sdma_delayed_completion(struct work_struct *work)
1413 {
1414         struct user_sdma_request *req =
1415                 container_of(work, struct user_sdma_request, worker);
1416         struct hfi1_user_sdma_pkt_q *pq = req->pq;
1417         struct user_sdma_txreq *tx = NULL;
1418         unsigned long flags;
1419         u64 seqnum;
1420         int i;
1421
1422         while (1) {
1423                 spin_lock_irqsave(&req->txcmp_lock, flags);
1424                 if (!list_empty(&req->txcmp)) {
1425                         tx = list_first_entry(&req->txcmp,
1426                                               struct user_sdma_txreq, list);
1427                         list_del(&tx->list);
1428                 }
1429                 spin_unlock_irqrestore(&req->txcmp_lock, flags);
1430                 if (!tx)
1431                         break;
1432
1433                 for (i = tx->idx; i >= 0; i--)
1434                         if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT)
1435                                 unpin_vector_pages(req, tx->iovecs[i].vec);
1436
1437                 seqnum = tx->seqnum;
1438                 kmem_cache_free(pq->txreq_cache, tx);
1439                 tx = NULL;
1440
1441                 if (req->status != SDMA_TXREQ_S_OK) {
1442                         if (seqnum == ACCESS_ONCE(req->seqnum) &&
1443                             test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
1444                                 atomic_dec(&pq->n_reqs);
1445                                 set_comp_state(req, ERROR, req->status);
1446                                 user_sdma_free_request(req);
1447                                 break;
1448                         }
1449                 } else {
1450                         if (seqnum == req->info.npkts - 1) {
1451                                 atomic_dec(&pq->n_reqs);
1452                                 set_comp_state(req, COMPLETE, 0);
1453                                 user_sdma_free_request(req);
1454                                 break;
1455                         }
1456                 }
1457         }
1458
1459         if (!atomic_read(&pq->n_reqs)) {
1460                 xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
1461                 wake_up(&pq->wait);
1462         }
1463 }
1464
1465 static void user_sdma_free_request(struct user_sdma_request *req)
1466 {
1467         if (!list_empty(&req->txps)) {
1468                 struct sdma_txreq *t, *p;
1469
1470                 list_for_each_entry_safe(t, p, &req->txps, list) {
1471                         struct user_sdma_txreq *tx =
1472                                 container_of(t, struct user_sdma_txreq, txreq);
1473                         list_del_init(&t->list);
1474                         sdma_txclean(req->pq->dd, t);
1475                         kmem_cache_free(req->pq->txreq_cache, tx);
1476                 }
1477         }
1478         if (req->data_iovs) {
1479                 int i;
1480
1481                 for (i = 0; i < req->data_iovs; i++)
1482                         if (req->iovs[i].npages && req->iovs[i].pages)
1483                                 unpin_vector_pages(req, &req->iovs[i]);
1484         }
1485         kfree(req->tids);
1486         clear_bit(SDMA_REQ_IN_USE, &req->flags);
1487 }
1488
1489 static inline void set_comp_state(struct user_sdma_request *req,
1490                                         enum hfi1_sdma_comp_state state,
1491                                         int ret)
1492 {
1493         SDMA_DBG(req, "Setting completion status %u %d", state, ret);
1494         req->cq->comps[req->info.comp_idx].status = state;
1495         if (state == ERROR)
1496                 req->cq->comps[req->info.comp_idx].errcode = -ret;
1497         trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt,
1498                                         req->pq->subctxt, req->info.comp_idx,
1499                                         state, ret);
1500 }