Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
author: Linus Torvalds <torvalds@linux-foundation.org>
Fri, 5 Aug 2016 00:10:31 +0000 (20:10 -0400)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Fri, 5 Aug 2016 00:10:31 +0000 (20:10 -0400)
Pull base rdma updates from Doug Ledford:
 "Round one of 4.8 code: while this is mostly normal, there is a new
  driver in here (the driver was hosted outside the kernel for several
  years and is actually a fairly mature and well coded driver).  It
  amounts to 13,000 of the 16,000 lines of added code in here.

  Summary:

   - Updates/fixes for iw_cxgb4 driver
   - Updates/fixes for mlx5 driver
   - Add flow steering and RSS API
   - Add hardware stats to mlx4 and mlx5 drivers
   - Add firmware version API for RDMA driver use
   - Add the rxe driver (this is a software RoCE driver that makes any
     Ethernet device a RoCE device)
   - Fixes for i40iw driver
   - Support for send only multicast joins in the cma layer
   - Other minor fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (72 commits)
  Soft RoCE driver
  IB/core: Support for CMA multicast join flags
  IB/sa: Add cached attribute containing SM information to SA port
  IB/uverbs: Fix race between uverbs_close and remove_one
  IB/mthca: Clean up error unwind flow in mthca_reset()
  IB/mthca: NULL arg to pci_dev_put is OK
  IB/hfi1: NULL arg to sc_return_credits is OK
  IB/mlx4: Add diagnostic hardware counters
  net/mlx4: Query performance and diagnostics counters
  net/mlx4: Add diagnostic counters capability bit
  Use smaller 512 byte messages for portmapper messages
  IB/ipoib: Report SG feature regardless of HW UD CSUM capability
  IB/mlx4: Don't use GFP_ATOMIC for CQ resize struct
  IB/hfi1: Disable by default
  IB/rdmavt: Disable by default
  IB/mlx5: Fix port counter ID association to QP offset
  IB/mlx5: Fix iteration overrun in GSI qps
  i40iw: Add NULL check for puda buffer
  i40iw: Change dup_ack_thresh to u8
  i40iw: Remove unnecessary check for moving CQ head
  ...

107 files changed:
MAINTAINERS
drivers/infiniband/Kconfig
drivers/infiniband/core/cma.c
drivers/infiniband/core/device.c
drivers/infiniband/core/iwcm.c
drivers/infiniband/core/iwcm.h
drivers/infiniband/core/iwpm_util.c
drivers/infiniband/core/multicast.c
drivers/infiniband/core/netlink.c
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ucma.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/cxgb3/iwch_cm.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/cq.c
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/cxgb4/iw_cxgb4.h
drivers/infiniband/hw/cxgb4/mem.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/hfi1/Kconfig
drivers/infiniband/hw/hfi1/file_ops.c
drivers/infiniband/hw/hfi1/hfi.h
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/i40iw/i40iw_cm.c
drivers/infiniband/hw/i40iw/i40iw_d.h
drivers/infiniband/hw/i40iw/i40iw_puda.c
drivers/infiniband/hw/i40iw/i40iw_type.h
drivers/infiniband/hw/i40iw/i40iw_uk.c
drivers/infiniband/hw/i40iw/i40iw_user.h
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/gsi.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mlx5/srq.c
drivers/infiniband/hw/mlx5/user.h
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/mthca/mthca_reset.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/usnic/usnic_ib_main.c
drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
drivers/infiniband/sw/Makefile
drivers/infiniband/sw/rdmavt/Kconfig
drivers/infiniband/sw/rxe/Kconfig [new file with mode: 0644]
drivers/infiniband/sw/rxe/Makefile [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_av.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_comp.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_cq.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_dma.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_hdr.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_icrc.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_loc.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_mcast.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_mmap.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_mr.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_net.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_net.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_opcode.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_opcode.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_param.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_pool.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_pool.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_qp.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_queue.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_queue.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_recv.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_req.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_resp.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_srq.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_sysfs.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_task.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_task.h [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_verbs.c [new file with mode: 0644]
drivers/infiniband/sw/rxe/rxe_verbs.h [new file with mode: 0644]
drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
drivers/net/ethernet/mellanox/mlx4/fw.c
drivers/net/ethernet/mellanox/mlx5/core/srq.c
drivers/net/ethernet/mellanox/mlx5/core/transobj.c
include/linux/mlx4/device.h
include/linux/mlx5/cq.h
include/linux/mlx5/driver.h
include/linux/mlx5/qp.h
include/linux/mlx5/srq.h
include/rdma/ib_sa.h
include/rdma/ib_verbs.h
include/rdma/rdma_cm.h
include/uapi/rdma/Kbuild
include/uapi/rdma/ib_user_verbs.h
include/uapi/rdma/rdma_user_cm.h
include/uapi/rdma/rdma_user_rxe.h [new file with mode: 0644]

diff --git a/MAINTAINERS b/MAINTAINERS
index f518e69..5e1f03f 100644
@@ -7647,6 +7647,15 @@ W:       http://www.mellanox.com
 Q:     http://patchwork.ozlabs.org/project/netdev/list/
 F:     drivers/net/ethernet/mellanox/mlxsw/
 
+SOFT-ROCE DRIVER (rxe)
+M:     Moni Shoua <monis@mellanox.com>
+L:     linux-rdma@vger.kernel.org
+S:     Supported
+W:     https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
+Q:     http://patchwork.kernel.org/project/linux-rdma/list/
+F:     drivers/infiniband/sw/rxe/
+F:     include/uapi/rdma/rdma_user_rxe.h
+
 MEMBARRIER SUPPORT
 M:     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 M:     "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 2137adf..e9b7dc0 100644
@@ -84,6 +84,7 @@ source "drivers/infiniband/ulp/iser/Kconfig"
 source "drivers/infiniband/ulp/isert/Kconfig"
 
 source "drivers/infiniband/sw/rdmavt/Kconfig"
+source "drivers/infiniband/sw/rxe/Kconfig"
 
 source "drivers/infiniband/hw/hfi1/Kconfig"
 
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index ad1b1ad..e6dfa1b 100644
@@ -68,6 +68,7 @@ MODULE_DESCRIPTION("Generic RDMA CM Agent");
 MODULE_LICENSE("Dual BSD/GPL");
 
 #define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000
 #define CMA_MAX_CM_RETRIES 15
 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
 #define CMA_IBOE_PACKET_LIFETIME 18
@@ -162,6 +163,14 @@ struct rdma_bind_list {
        unsigned short          port;
 };
 
+struct class_port_info_context {
+       struct ib_class_port_info       *class_port_info;
+       struct ib_device                *device;
+       struct completion               done;
+       struct ib_sa_query              *sa_query;
+       u8                              port_num;
+};
+
 static int cma_ps_alloc(struct net *net, enum rdma_port_space ps,
                        struct rdma_bind_list *bind_list, int snum)
 {
@@ -306,6 +315,7 @@ struct cma_multicast {
        struct sockaddr_storage addr;
        struct kref             mcref;
        bool                    igmp_joined;
+       u8                      join_state;
 };
 
 struct cma_work {
@@ -3752,10 +3762,63 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
        }
 }
 
+static void cma_query_sa_classport_info_cb(int status,
+                                          struct ib_class_port_info *rec,
+                                          void *context)
+{
+       struct class_port_info_context *cb_ctx = context;
+
+       WARN_ON(!context);
+
+       if (status || !rec) {
+               pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n",
+                        cb_ctx->device->name, cb_ctx->port_num, status);
+               goto out;
+       }
+
+       memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info));
+
+out:
+       complete(&cb_ctx->done);
+}
+
+static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num,
+                                      struct ib_class_port_info *class_port_info)
+{
+       struct class_port_info_context *cb_ctx;
+       int ret;
+
+       cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL);
+       if (!cb_ctx)
+               return -ENOMEM;
+
+       cb_ctx->device = device;
+       cb_ctx->class_port_info = class_port_info;
+       cb_ctx->port_num = port_num;
+       init_completion(&cb_ctx->done);
+
+       ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num,
+                                            CMA_QUERY_CLASSPORT_INFO_TIMEOUT,
+                                            GFP_KERNEL, cma_query_sa_classport_info_cb,
+                                            cb_ctx, &cb_ctx->sa_query);
+       if (ret < 0) {
+               pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n",
+                      device->name, port_num, ret);
+               goto out;
+       }
+
+       wait_for_completion(&cb_ctx->done);
+
+out:
+       kfree(cb_ctx);
+       return ret;
+}
+
 static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
                                 struct cma_multicast *mc)
 {
        struct ib_sa_mcmember_rec rec;
+       struct ib_class_port_info class_port_info;
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        ib_sa_comp_mask comp_mask;
        int ret;
@@ -3774,7 +3837,24 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
        rec.qkey = cpu_to_be32(id_priv->qkey);
        rdma_addr_get_sgid(dev_addr, &rec.port_gid);
        rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
-       rec.join_state = 1;
+       rec.join_state = mc->join_state;
+
+       if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) {
+               ret = cma_query_sa_classport_info(id_priv->id.device,
+                                                 id_priv->id.port_num,
+                                                 &class_port_info);
+
+               if (ret)
+                       return ret;
+
+               if (!(ib_get_cpi_capmask2(&class_port_info) &
+                     IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) {
+                       pr_warn("RDMA CM: %s port %u Unable to multicast join\n"
+                               "RDMA CM: SM doesn't support Send Only Full Member option\n",
+                               id_priv->id.device->name, id_priv->id.port_num);
+                       return -EOPNOTSUPP;
+               }
+       }
 
        comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
                    IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
@@ -3843,6 +3923,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
        struct sockaddr *addr = (struct sockaddr *)&mc->addr;
        struct net_device *ndev = NULL;
        enum ib_gid_type gid_type;
+       bool send_only;
+
+       send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
 
        if (cma_zero_addr((struct sockaddr *)&mc->addr))
                return -EINVAL;
@@ -3878,10 +3961,12 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
        if (addr->sa_family == AF_INET) {
                if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
                        mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
-                       err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
-                                           true);
-                       if (!err)
-                               mc->igmp_joined = true;
+                       if (!send_only) {
+                               err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
+                                                   true);
+                               if (!err)
+                                       mc->igmp_joined = true;
+                       }
                }
        } else {
                if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
@@ -3911,7 +3996,7 @@ out1:
 }
 
 int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
-                       void *context)
+                       u8 join_state, void *context)
 {
        struct rdma_id_private *id_priv;
        struct cma_multicast *mc;
@@ -3930,6 +4015,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
        mc->context = context;
        mc->id_priv = id_priv;
        mc->igmp_joined = false;
+       mc->join_state = join_state;
        spin_lock(&id_priv->lock);
        list_add(&mc->list, &id_priv->mc_list);
        spin_unlock(&id_priv->lock);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5c155fa..760ef60 100644
@@ -311,6 +311,15 @@ static int read_port_immutable(struct ib_device *device)
        return 0;
 }
 
+void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len)
+{
+       if (dev->get_dev_fw_str)
+               dev->get_dev_fw_str(dev, str, str_len);
+       else
+               str[0] = '\0';
+}
+EXPORT_SYMBOL(ib_get_device_fw_str);
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index f057204..357624f 100644
@@ -183,15 +183,14 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
 
 /*
  * Release a reference on cm_id. If the last reference is being
- * released, enable the waiting thread (in iw_destroy_cm_id) to
- * get woken up, and return 1 if a thread is already waiting.
+ * released, free the cm_id and return 1.
  */
 static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
 {
        BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
        if (atomic_dec_and_test(&cm_id_priv->refcount)) {
                BUG_ON(!list_empty(&cm_id_priv->work_list));
-               complete(&cm_id_priv->destroy_comp);
+               free_cm_id(cm_id_priv);
                return 1;
        }
 
@@ -208,19 +207,10 @@ static void add_ref(struct iw_cm_id *cm_id)
 static void rem_ref(struct iw_cm_id *cm_id)
 {
        struct iwcm_id_private *cm_id_priv;
-       int cb_destroy;
 
        cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 
-       /*
-        * Test bit before deref in case the cm_id gets freed on another
-        * thread.
-        */
-       cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-       if (iwcm_deref_id(cm_id_priv) && cb_destroy) {
-               BUG_ON(!list_empty(&cm_id_priv->work_list));
-               free_cm_id(cm_id_priv);
-       }
+       (void)iwcm_deref_id(cm_id_priv);
 }
 
 static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
@@ -370,6 +360,12 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
        wait_event(cm_id_priv->connect_wait,
                   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
 
+       /*
+        * Since we're deleting the cm_id, drop any events that
+        * might arrive before the last dereference.
+        */
+       set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
+
        spin_lock_irqsave(&cm_id_priv->lock, flags);
        switch (cm_id_priv->state) {
        case IW_CM_STATE_LISTEN:
@@ -433,13 +429,7 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id)
        struct iwcm_id_private *cm_id_priv;
 
        cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
-       BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
-
        destroy_cm_id(cm_id);
-
-       wait_for_completion(&cm_id_priv->destroy_comp);
-
-       free_cm_id(cm_id_priv);
 }
 EXPORT_SYMBOL(iw_destroy_cm_id);
 
@@ -809,10 +799,7 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
        ret = cm_id->cm_handler(cm_id, iw_event);
        if (ret) {
                iw_cm_reject(cm_id, NULL, 0);
-               set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-               destroy_cm_id(cm_id);
-               if (atomic_read(&cm_id_priv->refcount)==0)
-                       free_cm_id(cm_id_priv);
+               iw_destroy_cm_id(cm_id);
        }
 
 out:
@@ -1000,7 +987,6 @@ static void cm_work_handler(struct work_struct *_work)
        unsigned long flags;
        int empty;
        int ret = 0;
-       int destroy_id;
 
        spin_lock_irqsave(&cm_id_priv->lock, flags);
        empty = list_empty(&cm_id_priv->work_list);
@@ -1013,20 +999,14 @@ static void cm_work_handler(struct work_struct *_work)
                put_work(work);
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
-               ret = process_event(cm_id_priv, &levent);
-               if (ret) {
-                       set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-                       destroy_cm_id(&cm_id_priv->id);
-               }
-               BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
-               destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-               if (iwcm_deref_id(cm_id_priv)) {
-                       if (destroy_id) {
-                               BUG_ON(!list_empty(&cm_id_priv->work_list));
-                               free_cm_id(cm_id_priv);
-                       }
+               if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+                       ret = process_event(cm_id_priv, &levent);
+                       if (ret)
+                               destroy_cm_id(&cm_id_priv->id);
+               } else
+                       pr_debug("dropping event %d\n", levent.event);
+               if (iwcm_deref_id(cm_id_priv))
                        return;
-               }
                if (empty)
                        return;
                spin_lock_irqsave(&cm_id_priv->lock, flags);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
index 3f6cc82..82c2cd1 100644
@@ -56,7 +56,7 @@ struct iwcm_id_private {
        struct list_head work_free_list;
 };
 
-#define IWCM_F_CALLBACK_DESTROY   1
+#define IWCM_F_DROP_EVENTS       1
 #define IWCM_F_CONNECT_WAIT       2
 
 #endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index b65e06c..ade71e7 100644
@@ -37,6 +37,7 @@
 #define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
 #define IWPM_REMINFO_HASH_SIZE 64
 #define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+#define IWPM_MSG_SIZE          512
 
 static LIST_HEAD(iwpm_nlmsg_req_list);
 static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
@@ -452,7 +453,7 @@ struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
 {
        struct sk_buff *skb = NULL;
 
-       skb = dev_alloc_skb(NLMSG_GOODSIZE);
+       skb = dev_alloc_skb(IWPM_MSG_SIZE);
        if (!skb) {
                pr_err("%s Unable to allocate skb\n", __func__);
                goto create_nlmsg_exit;
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index a83ec28..3a3c5d7 100644
@@ -93,18 +93,6 @@ enum {
 
 struct mcast_member;
 
-/*
-* There are 4 types of join states:
-* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
-*/
-enum {
-       FULLMEMBER_JOIN,
-       NONMEMBER_JOIN,
-       SENDONLY_NONMEBER_JOIN,
-       SENDONLY_FULLMEMBER_JOIN,
-       NUM_JOIN_MEMBERSHIP_TYPES,
-};
-
 struct mcast_group {
        struct ib_sa_mcmember_rec rec;
        struct rb_node          node;
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 9b8c20c..10469b0 100644
@@ -229,7 +229,10 @@ static void ibnl_rcv(struct sk_buff *skb)
 int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
                        __u32 pid)
 {
-       return nlmsg_unicast(nls, skb, pid);
+       int err;
+
+       err = netlink_unicast(nls, skb, pid, 0);
+       return (err < 0) ? err : 0;
 }
 EXPORT_SYMBOL(ibnl_unicast);
 
@@ -252,6 +255,7 @@ int __init ibnl_init(void)
                return -ENOMEM;
        }
 
+       nls->sk_sndtimeo = 10 * HZ;
        return 0;
 }
 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index e955386..b9bf7aa 100644
@@ -65,10 +65,17 @@ struct ib_sa_sm_ah {
        u8                   src_path_mask;
 };
 
+struct ib_sa_classport_cache {
+       bool valid;
+       struct ib_class_port_info data;
+};
+
 struct ib_sa_port {
        struct ib_mad_agent *agent;
        struct ib_sa_sm_ah  *sm_ah;
        struct work_struct   update_task;
+       struct ib_sa_classport_cache classport_info;
+       spinlock_t                   classport_lock; /* protects class port info set */
        spinlock_t           ah_lock;
        u8                   port_num;
 };
@@ -998,6 +1005,13 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
                port->sm_ah = NULL;
                spin_unlock_irqrestore(&port->ah_lock, flags);
 
+               if (event->event == IB_EVENT_SM_CHANGE ||
+                   event->event == IB_EVENT_CLIENT_REREGISTER ||
+                   event->event == IB_EVENT_LID_CHANGE) {
+                       spin_lock_irqsave(&port->classport_lock, flags);
+                       port->classport_info.valid = false;
+                       spin_unlock_irqrestore(&port->classport_lock, flags);
+               }
                queue_work(ib_wq, &sa_dev->port[event->element.port_num -
                                            sa_dev->start_port].update_task);
        }
@@ -1719,6 +1733,7 @@ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
                                              int status,
                                              struct ib_sa_mad *mad)
 {
+       unsigned long flags;
        struct ib_sa_classport_info_query *query =
                container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
 
@@ -1728,6 +1743,16 @@ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
                ib_unpack(classport_info_rec_table,
                          ARRAY_SIZE(classport_info_rec_table),
                          mad->data, &rec);
+
+               spin_lock_irqsave(&sa_query->port->classport_lock, flags);
+               if (!status && !sa_query->port->classport_info.valid) {
+                       memcpy(&sa_query->port->classport_info.data, &rec,
+                              sizeof(sa_query->port->classport_info.data));
+
+                       sa_query->port->classport_info.valid = true;
+               }
+               spin_unlock_irqrestore(&sa_query->port->classport_lock, flags);
+
                query->callback(status, &rec, query->context);
        } else {
                query->callback(status, NULL, query->context);
@@ -1754,7 +1779,9 @@ int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
        struct ib_sa_port *port;
        struct ib_mad_agent *agent;
        struct ib_sa_mad *mad;
+       struct ib_class_port_info cached_class_port_info;
        int ret;
+       unsigned long flags;
 
        if (!sa_dev)
                return -ENODEV;
@@ -1762,6 +1789,17 @@ int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
        port  = &sa_dev->port[port_num - sa_dev->start_port];
        agent = port->agent;
 
+       /* Use cached ClassPortInfo attribute if valid instead of sending mad */
+       spin_lock_irqsave(&port->classport_lock, flags);
+       if (port->classport_info.valid && callback) {
+               memcpy(&cached_class_port_info, &port->classport_info.data,
+                      sizeof(cached_class_port_info));
+               spin_unlock_irqrestore(&port->classport_lock, flags);
+               callback(0, &cached_class_port_info, context);
+               return 0;
+       }
+       spin_unlock_irqrestore(&port->classport_lock, flags);
+
        query = kzalloc(sizeof(*query), gfp_mask);
        if (!query)
                return -ENOMEM;
@@ -1885,6 +1923,9 @@ static void ib_sa_add_one(struct ib_device *device)
                sa_dev->port[i].sm_ah    = NULL;
                sa_dev->port[i].port_num = i + s;
 
+               spin_lock_init(&sa_dev->port[i].classport_lock);
+               sa_dev->port[i].classport_info.valid = false;
+
                sa_dev->port[i].agent =
                        ib_register_mad_agent(device, i + s, IB_QPT_GSI,
                                              NULL, 0, send_handler,
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 60df4f8..15defef 100644
@@ -38,6 +38,7 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/netdevice.h>
+#include <linux/ethtool.h>
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_pma.h>
@@ -1200,16 +1201,28 @@ static ssize_t set_node_desc(struct device *device,
        return count;
 }
 
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+                          char *buf)
+{
+       struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+       ib_get_device_fw_str(dev, buf, PAGE_SIZE);
+       strlcat(buf, "\n", PAGE_SIZE);
+       return strlen(buf);
+}
+
 static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
 static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
 static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
 static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
 
 static struct device_attribute *ib_class_attributes[] = {
        &dev_attr_node_type,
        &dev_attr_sys_image_guid,
        &dev_attr_node_guid,
-       &dev_attr_node_desc
+       &dev_attr_node_desc,
+       &dev_attr_fw_ver,
 };
 
 static void free_port_list_attributes(struct ib_device *device)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index c0f3826..2825ece 100644
@@ -106,6 +106,7 @@ struct ucma_multicast {
        int                     events_reported;
 
        u64                     uid;
+       u8                      join_state;
        struct list_head        list;
        struct sockaddr_storage addr;
 };
@@ -1317,12 +1318,20 @@ static ssize_t ucma_process_join(struct ucma_file *file,
        struct ucma_multicast *mc;
        struct sockaddr *addr;
        int ret;
+       u8 join_state;
 
        if (out_len < sizeof(resp))
                return -ENOSPC;
 
        addr = (struct sockaddr *) &cmd->addr;
-       if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+       if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+               return -EINVAL;
+
+       if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER)
+               join_state = BIT(FULLMEMBER_JOIN);
+       else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)
+               join_state = BIT(SENDONLY_FULLMEMBER_JOIN);
+       else
                return -EINVAL;
 
        ctx = ucma_get_ctx(file, cmd->id);
@@ -1335,10 +1344,11 @@ static ssize_t ucma_process_join(struct ucma_file *file,
                ret = -ENOMEM;
                goto err1;
        }
-
+       mc->join_state = join_state;
        mc->uid = cmd->uid;
        memcpy(&mc->addr, addr, cmd->addr_size);
-       ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+       ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
+                                 join_state, mc);
        if (ret)
                goto err2;
 
@@ -1382,7 +1392,7 @@ static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
        join_cmd.uid = cmd.uid;
        join_cmd.id = cmd.id;
        join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
-       join_cmd.reserved = 0;
+       join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
        memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
 
        return ucma_process_join(file, &join_cmd, out_len);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 612ccfd..df26a74 100644
@@ -116,6 +116,7 @@ struct ib_uverbs_event_file {
 struct ib_uverbs_file {
        struct kref                             ref;
        struct mutex                            mutex;
+       struct mutex                            cleanup_mutex; /* protect cleanup */
        struct ib_uverbs_device                *device;
        struct ib_ucontext                     *ucontext;
        struct ib_event_handler                 event_handler;
@@ -162,6 +163,10 @@ struct ib_uqp_object {
        struct ib_uxrcd_object *uxrcd;
 };
 
+struct ib_uwq_object {
+       struct ib_uevent_object uevent;
+};
+
 struct ib_ucq_object {
        struct ib_uobject       uobject;
        struct ib_uverbs_file  *uverbs_file;
@@ -181,6 +186,8 @@ extern struct idr ib_uverbs_qp_idr;
 extern struct idr ib_uverbs_srq_idr;
 extern struct idr ib_uverbs_xrcd_idr;
 extern struct idr ib_uverbs_rule_idr;
+extern struct idr ib_uverbs_wq_idr;
+extern struct idr ib_uverbs_rwq_ind_tbl_idr;
 
 void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
 
@@ -199,6 +206,7 @@ void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
 void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
 void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
 void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr);
 void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
 void ib_uverbs_event_handler(struct ib_event_handler *handler,
                             struct ib_event *event);
@@ -219,6 +227,7 @@ struct ib_uverbs_flow_spec {
                struct ib_uverbs_flow_spec_eth     eth;
                struct ib_uverbs_flow_spec_ipv4    ipv4;
                struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+               struct ib_uverbs_flow_spec_ipv6    ipv6;
        };
 };
 
@@ -275,5 +284,10 @@ IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
 IB_UVERBS_DECLARE_EX_CMD(query_device);
 IB_UVERBS_DECLARE_EX_CMD(create_cq);
 IB_UVERBS_DECLARE_EX_CMD(create_qp);
+IB_UVERBS_DECLARE_EX_CMD(create_wq);
+IB_UVERBS_DECLARE_EX_CMD(modify_wq);
+IB_UVERBS_DECLARE_EX_CMD(destroy_wq);
+IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table);
+IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table);
 
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 825021d..f664731 100644
@@ -57,6 +57,8 @@ static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
 static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
 static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
 static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" };
+static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" };
 
 /*
  * The ib_uobject locking scheme is as follows:
@@ -243,6 +245,27 @@ static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
        return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
 }
 
+/*
+ * Typed wrapper around idr_read_obj() for the WQ idr.  Callers in this file
+ * treat a NULL result as an invalid handle and release a non-NULL result
+ * with put_wq_read().
+ */
+static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context)
+{
+       struct ib_wq *wq;
+
+       wq = idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0);
+       return wq;
+}
+
+/* Drop the read reference on the uobject behind @wq via put_uobj_read(). */
+static void put_wq_read(struct ib_wq *wq)
+{
+       struct ib_uobject *uobj = wq->uobject;
+
+       put_uobj_read(uobj);
+}
+
+/*
+ * Typed wrapper around idr_read_obj() for the RWQ indirection table idr.
+ * A non-NULL result is released with put_rwq_indirection_table_read().
+ */
+static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle,
+                                                              struct ib_ucontext *context)
+{
+       struct ib_rwq_ind_table *ind_table;
+
+       ind_table = idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle,
+                                context, 0);
+       return ind_table;
+}
+
+/* Drop the read reference on the uobject behind @ind_table. */
+static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table)
+{
+       struct ib_uobject *uobj = ind_table->uobject;
+
+       put_uobj_read(uobj);
+}
+
 static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
 {
        struct ib_uobject *uobj;
@@ -326,6 +349,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        INIT_LIST_HEAD(&ucontext->qp_list);
        INIT_LIST_HEAD(&ucontext->srq_list);
        INIT_LIST_HEAD(&ucontext->ah_list);
+       INIT_LIST_HEAD(&ucontext->wq_list);
+       INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list);
        INIT_LIST_HEAD(&ucontext->xrcd_list);
        INIT_LIST_HEAD(&ucontext->rule_list);
        rcu_read_lock();
@@ -1750,6 +1775,8 @@ static int create_qp(struct ib_uverbs_file *file,
        struct ib_qp_init_attr          attr = {};
        struct ib_uverbs_ex_create_qp_resp resp;
        int                             ret;
+       struct ib_rwq_ind_table *ind_tbl = NULL;
+       bool has_sq = true;
 
        if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
                return -EPERM;
@@ -1761,6 +1788,32 @@ static int create_qp(struct ib_uverbs_file *file,
        init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext,
                  &qp_lock_class);
        down_write(&obj->uevent.uobject.mutex);
+       if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) +
+                     sizeof(cmd->rwq_ind_tbl_handle) &&
+                     (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) {
+               ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle,
+                                                        file->ucontext);
+               if (!ind_tbl) {
+                       ret = -EINVAL;
+                       goto err_put;
+               }
+
+               attr.rwq_ind_tbl = ind_tbl;
+       }
+
+       if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) +
+                      sizeof(cmd->reserved1)) && cmd->reserved1) {
+               ret = -EOPNOTSUPP;
+               goto err_put;
+       }
+
+       if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
+               ret = -EINVAL;
+               goto err_put;
+       }
+
+       if (ind_tbl && !cmd->max_send_wr)
+               has_sq = false;
 
        if (cmd->qp_type == IB_QPT_XRC_TGT) {
                xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext,
@@ -1784,20 +1837,24 @@ static int create_qp(struct ib_uverbs_file *file,
                                }
                        }
 
-                       if (cmd->recv_cq_handle != cmd->send_cq_handle) {
-                               rcq = idr_read_cq(cmd->recv_cq_handle,
-                                                 file->ucontext, 0);
-                               if (!rcq) {
-                                       ret = -EINVAL;
-                                       goto err_put;
+                       if (!ind_tbl) {
+                               if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+                                       rcq = idr_read_cq(cmd->recv_cq_handle,
+                                                         file->ucontext, 0);
+                                       if (!rcq) {
+                                               ret = -EINVAL;
+                                               goto err_put;
+                                       }
                                }
                        }
                }
 
-               scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
-               rcq = rcq ?: scq;
+               if (has_sq)
+                       scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
+               if (!ind_tbl)
+                       rcq = rcq ?: scq;
                pd  = idr_read_pd(cmd->pd_handle, file->ucontext);
-               if (!pd || !scq) {
+               if (!pd || (!scq && has_sq)) {
                        ret = -EINVAL;
                        goto err_put;
                }
@@ -1864,16 +1921,20 @@ static int create_qp(struct ib_uverbs_file *file,
                qp->send_cq       = attr.send_cq;
                qp->recv_cq       = attr.recv_cq;
                qp->srq           = attr.srq;
+               qp->rwq_ind_tbl   = ind_tbl;
                qp->event_handler = attr.event_handler;
                qp->qp_context    = attr.qp_context;
                qp->qp_type       = attr.qp_type;
                atomic_set(&qp->usecnt, 0);
                atomic_inc(&pd->usecnt);
-               atomic_inc(&attr.send_cq->usecnt);
+               if (attr.send_cq)
+                       atomic_inc(&attr.send_cq->usecnt);
                if (attr.recv_cq)
                        atomic_inc(&attr.recv_cq->usecnt);
                if (attr.srq)
                        atomic_inc(&attr.srq->usecnt);
+               if (ind_tbl)
+                       atomic_inc(&ind_tbl->usecnt);
        }
        qp->uobject = &obj->uevent.uobject;
 
@@ -1913,6 +1974,8 @@ static int create_qp(struct ib_uverbs_file *file,
                put_cq_read(rcq);
        if (srq)
                put_srq_read(srq);
+       if (ind_tbl)
+               put_rwq_indirection_table_read(ind_tbl);
 
        mutex_lock(&file->mutex);
        list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
@@ -1940,6 +2003,8 @@ err_put:
                put_cq_read(rcq);
        if (srq)
                put_srq_read(srq);
+       if (ind_tbl)
+               put_rwq_indirection_table_read(ind_tbl);
 
        put_uobj_write(&obj->uevent.uobject);
        return ret;
@@ -2033,7 +2098,7 @@ int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
        if (err)
                return err;
 
-       if (cmd.comp_mask)
+       if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
                return -EINVAL;
 
        if (cmd.reserved)
@@ -3040,6 +3105,15 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
                memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
                       sizeof(struct ib_flow_ipv4_filter));
                break;
+       case IB_FLOW_SPEC_IPV6:
+               ib_spec->ipv6.size = sizeof(struct ib_flow_spec_ipv6);
+               if (ib_spec->ipv6.size != kern_spec->ipv6.size)
+                       return -EINVAL;
+               memcpy(&ib_spec->ipv6.val, &kern_spec->ipv6.val,
+                      sizeof(struct ib_flow_ipv6_filter));
+               memcpy(&ib_spec->ipv6.mask, &kern_spec->ipv6.mask,
+                      sizeof(struct ib_flow_ipv6_filter));
+               break;
        case IB_FLOW_SPEC_TCP:
        case IB_FLOW_SPEC_UDP:
                ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
@@ -3056,6 +3130,445 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
        return 0;
 }
 
+/*
+ * Extended uverbs command: create a work queue (WQ) on behalf of user space.
+ *
+ * @file:   uverbs file issuing the command; file->ucontext owns the uobject
+ * @ib_dev: not referenced here - the device is taken from the PD
+ * @ucore:  core command input / response output buffers
+ * @uhw:    passed through to the driver's create_wq() callback
+ *
+ * Returns 0 on success.  Fails with -EINVAL (command shorter than "through
+ * max_sge", or an unknown PD/CQ handle), -ENOSPC (response buffer shorter
+ * than "through wqn"), -EOPNOTSUPP (input past sizeof(cmd) rejected by
+ * ib_is_udata_cleared(), or any comp_mask bit set), -ENOMEM, or the error
+ * reported by the udata copy helpers, the driver or idr_add_uobj().
+ */
+int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
+                          struct ib_udata *ucore,
+                          struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_create_wq     cmd = {};
+       struct ib_uverbs_ex_create_wq_resp resp = {};
+       struct ib_uwq_object           *obj;
+       int err = 0;
+       struct ib_cq *cq;
+       struct ib_pd *pd;
+       struct ib_wq *wq;
+       struct ib_wq_init_attr wq_init_attr = {};
+       size_t required_cmd_sz;
+       size_t required_resp_len;
+
+       required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge);
+       required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn);
+
+       if (ucore->inlen < required_cmd_sz)
+               return -EINVAL;
+
+       if (ucore->outlen < required_resp_len)
+               return -ENOSPC;
+
+       /* Input beyond sizeof(cmd) must pass ib_is_udata_cleared(). */
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (err)
+               return err;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+       if (!obj)
+               return -ENOMEM;
+
+       init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext,
+                 &wq_lock_class);
+       down_write(&obj->uevent.uobject.mutex);
+       pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
+       if (!pd) {
+               err = -EINVAL;
+               goto err_uobj;
+       }
+
+       cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+       if (!cq) {
+               err = -EINVAL;
+               goto err_put_pd;
+       }
+
+       wq_init_attr.cq = cq;
+       wq_init_attr.max_sge = cmd.max_sge;
+       wq_init_attr.max_wr = cmd.max_wr;
+       wq_init_attr.wq_context = file;
+       wq_init_attr.wq_type = cmd.wq_type;
+       wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
+       /* obj came from kmalloc(), so the event bookkeeping is set by hand. */
+       obj->uevent.events_reported = 0;
+       INIT_LIST_HEAD(&obj->uevent.event_list);
+       wq = pd->device->create_wq(pd, &wq_init_attr, uhw);
+       if (IS_ERR(wq)) {
+               err = PTR_ERR(wq);
+               goto err_put_cq;
+       }
+
+       wq->uobject = &obj->uevent.uobject;
+       obj->uevent.uobject.object = wq;
+       wq->wq_type = wq_init_attr.wq_type;
+       wq->cq = cq;
+       wq->pd = pd;
+       wq->device = pd->device;
+       wq->wq_context = wq_init_attr.wq_context;
+       /*
+        * Same as ib_create_wq(): attach the handler requested above to the
+        * WQ itself, otherwise it is only ever stored in wq_init_attr.
+        */
+       wq->event_handler = wq_init_attr.event_handler;
+       atomic_set(&wq->usecnt, 0);
+       atomic_inc(&pd->usecnt);
+       atomic_inc(&cq->usecnt);
+
+       err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+       if (err)
+               goto destroy_wq;
+
+       memset(&resp, 0, sizeof(resp));
+       resp.wq_handle = obj->uevent.uobject.id;
+       resp.max_sge = wq_init_attr.max_sge;
+       resp.max_wr = wq_init_attr.max_wr;
+       resp.wqn = wq->wq_num;
+       resp.response_length = required_resp_len;
+       err = ib_copy_to_udata(ucore,
+                              &resp, resp.response_length);
+       if (err)
+               goto err_copy;
+
+       put_pd_read(pd);
+       put_cq_read(cq);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list);
+       mutex_unlock(&file->mutex);
+
+       obj->uevent.uobject.live = 1;
+       up_write(&obj->uevent.uobject.mutex);
+       return 0;
+
+       /* Unwind in reverse order of acquisition. */
+err_copy:
+       idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+destroy_wq:
+       ib_destroy_wq(wq);
+err_put_cq:
+       put_cq_read(cq);
+err_put_pd:
+       put_pd_read(pd);
+err_uobj:
+       put_uobj_write(&obj->uevent.uobject);
+
+       return err;
+}
+
+/*
+ * Extended uverbs command: destroy a WQ previously created through
+ * ib_uverbs_ex_create_wq() and report how many events were delivered for it.
+ *
+ * Returns 0 on success, -EINVAL for a short command or unknown handle,
+ * -ENOSPC for a short response buffer, -EOPNOTSUPP for input past
+ * sizeof(cmd) rejected by ib_is_udata_cleared() or a non-zero comp_mask,
+ * or the error from ib_destroy_wq() / the udata copy helpers.
+ */
+int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
+                           struct ib_udata *ucore,
+                           struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_destroy_wq cmd = {};
+       struct ib_uverbs_ex_destroy_wq_resp resp = {};
+       struct ib_uwq_object *uwq;
+       struct ib_uobject *uobj;
+       struct ib_wq *wq;
+       size_t min_cmd_len;
+       size_t resp_len;
+       int rc;
+
+       min_cmd_len = offsetof(typeof(cmd), wq_handle) +
+                     sizeof(cmd.wq_handle);
+       resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+
+       if (ucore->inlen < min_cmd_len)
+               return -EINVAL;
+
+       if (ucore->outlen < resp_len)
+               return -ENOSPC;
+
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       rc = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (rc)
+               return rc;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       resp.response_length = resp_len;
+       uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle,
+                             file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       wq = uobj->object;
+       uwq = container_of(uobj, struct ib_uwq_object, uevent.uobject);
+
+       rc = ib_destroy_wq(wq);
+       if (rc) {
+               /* Destroy failed: the uobject stays live, only unlock it. */
+               put_uobj_write(uobj);
+               return rc;
+       }
+
+       uobj->live = 0;
+       put_uobj_write(uobj);
+
+       idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       ib_uverbs_release_uevent(file, &uwq->uevent);
+       /* Read the counter before the final put_uobj() on the uobject. */
+       resp.events_reported = uwq->uevent.events_reported;
+       put_uobj(uobj);
+
+       return ib_copy_to_udata(ucore, &resp, resp.response_length);
+}
+
+/*
+ * Extended uverbs command: modify the state of a WQ.
+ *
+ * @file:   uverbs file issuing the command
+ * @ib_dev: not referenced here - the device is taken from the WQ
+ * @ucore:  core command input buffer (this command writes no response)
+ * @uhw:    passed through to the driver's modify_wq() callback
+ *
+ * Returns the driver's modify_wq() result, or -EINVAL for a short command,
+ * an empty or unknown attr_mask or an unknown WQ handle, -EOPNOTSUPP for
+ * input past sizeof(cmd) rejected by ib_is_udata_cleared(), or the error
+ * from ib_copy_from_udata().
+ */
+int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
+                          struct ib_udata *ucore,
+                          struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_modify_wq cmd = {};
+       struct ib_wq *wq;
+       struct ib_wq_attr wq_attr = {};
+       size_t required_cmd_sz;
+       int ret;
+
+       required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state);
+       if (ucore->inlen < required_cmd_sz)
+               return -EINVAL;
+
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (ret)
+               return ret;
+
+       if (!cmd.attr_mask)
+               return -EINVAL;
+
+       /*
+        * attr_mask is a bit set: reject any bit outside the supported ones
+        * instead of comparing magnitudes, which is only correct while the
+        * supported flags are the lowest contiguous bits.
+        */
+       if (cmd.attr_mask & ~(IB_WQ_STATE | IB_WQ_CUR_STATE))
+               return -EINVAL;
+
+       wq = idr_read_wq(cmd.wq_handle, file->ucontext);
+       if (!wq)
+               return -EINVAL;
+
+       wq_attr.curr_wq_state = cmd.curr_wq_state;
+       wq_attr.wq_state = cmd.wq_state;
+       ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw);
+       put_wq_read(wq);
+       return ret;
+}
+
+/*
+ * Extended uverbs command: build a receive-WQ indirection table from a list
+ * of WQ handles supplied by user space.
+ *
+ * The input is a fixed header (through log_ind_tbl_size) followed by
+ * 2^log_ind_tbl_size u32 WQ handles.  On success the array of WQ pointers
+ * stays attached to the table (->ind_tbl) and is not freed here.
+ *
+ * Returns 0 on success, otherwise a negative errno.
+ */
+int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
+                                     struct ib_device *ib_dev,
+                                     struct ib_udata *ucore,
+                                     struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_create_rwq_ind_table cmd = {};
+       struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {};
+       struct ib_rwq_ind_table_init_attr attr = {};
+       struct ib_rwq_ind_table *ind_tbl;
+       struct ib_uobject *uobj;
+       struct ib_wq **tbl = NULL;
+       u32 *handles = NULL;
+       size_t hdr_len;
+       size_t resp_len;
+       u32 nentries;
+       u32 handles_len;
+       int nread, k;
+       int err = 0;
+
+       hdr_len = offsetof(typeof(cmd), log_ind_tbl_size) +
+                 sizeof(cmd.log_ind_tbl_size);
+       resp_len = offsetof(typeof(resp), ind_tbl_num) +
+                  sizeof(resp.ind_tbl_num);
+
+       if (ucore->inlen < hdr_len)
+               return -EINVAL;
+
+       if (ucore->outlen < resp_len)
+               return -ENOSPC;
+
+       err = ib_copy_from_udata(&cmd, ucore, hdr_len);
+       if (err)
+               return err;
+
+       /* Step past the header; the handle array is parsed from here on. */
+       ucore->inbuf += hdr_len;
+       ucore->inlen -= hdr_len;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE)
+               return -EINVAL;
+
+       nentries = 1 << cmd.log_ind_tbl_size;
+       /*
+        * input size for wq handles is u64 aligned: a single handle is
+        * followed by one u32 of padding.
+        */
+       handles_len = (nentries == 1 ? 2 : nentries) * sizeof(__u32);
+
+       if (ucore->inlen < handles_len)
+               return -EINVAL;
+
+       if (ucore->inlen > handles_len &&
+           !ib_is_udata_cleared(ucore, handles_len,
+                                ucore->inlen - handles_len))
+               return -EOPNOTSUPP;
+
+       handles = kcalloc(nentries, sizeof(*handles), GFP_KERNEL);
+       if (!handles)
+               return -ENOMEM;
+
+       err = ib_copy_from_udata(handles, ucore, nentries * sizeof(__u32));
+       if (err)
+               goto err_free;
+
+       tbl = kcalloc(nentries, sizeof(*tbl), GFP_KERNEL);
+       if (!tbl) {
+               err = -ENOMEM;
+               goto err_free;
+       }
+
+       /* nread counts the WQs read so far; the unwind path puts that many. */
+       for (nread = 0; nread < nentries; nread++) {
+               tbl[nread] = idr_read_wq(handles[nread], file->ucontext);
+               if (!tbl[nread]) {
+                       err = -EINVAL;
+                       goto put_wqs;
+               }
+       }
+
+       uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+       if (!uobj) {
+               err = -ENOMEM;
+               goto put_wqs;
+       }
+
+       init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class);
+       down_write(&uobj->mutex);
+
+       attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
+       attr.ind_tbl = tbl;
+       ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &attr, uhw);
+       if (IS_ERR(ind_tbl)) {
+               err = PTR_ERR(ind_tbl);
+               goto err_uobj;
+       }
+
+       ind_tbl->ind_tbl = tbl;
+       ind_tbl->log_ind_tbl_size = attr.log_ind_tbl_size;
+       ind_tbl->uobject = uobj;
+       uobj->object = ind_tbl;
+       ind_tbl->device = ib_dev;
+       atomic_set(&ind_tbl->usecnt, 0);
+
+       /* Every WQ in the table is now in use by it. */
+       for (k = 0; k < nentries; k++)
+               atomic_inc(&tbl[k]->usecnt);
+
+       err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+       if (err)
+               goto destroy_ind_tbl;
+
+       resp.ind_tbl_handle = uobj->id;
+       resp.ind_tbl_num = ind_tbl->ind_tbl_num;
+       resp.response_length = resp_len;
+
+       err = ib_copy_to_udata(ucore, &resp, resp.response_length);
+       if (err)
+               goto err_copy;
+
+       /* Only the handle copy is freed; tbl stays owned by the table. */
+       kfree(handles);
+
+       for (k = 0; k < nread; k++)
+               put_wq_read(tbl[k]);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list);
+       mutex_unlock(&file->mutex);
+
+       uobj->live = 1;
+
+       up_write(&uobj->mutex);
+       return 0;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+destroy_ind_tbl:
+       ib_destroy_rwq_ind_table(ind_tbl);
+err_uobj:
+       put_uobj_write(uobj);
+put_wqs:
+       for (k = 0; k < nread; k++)
+               put_wq_read(tbl[k]);
+err_free:
+       kfree(handles);
+       kfree(tbl);
+       return err;
+}
+
+/*
+ * Extended uverbs command: destroy a receive-WQ indirection table.
+ *
+ * The WQ pointer array hanging off the table is freed here, after the
+ * table itself has been destroyed successfully.
+ *
+ * Returns 0 on success, -EINVAL for a short command or unknown handle,
+ * -EOPNOTSUPP for input past sizeof(cmd) rejected by ib_is_udata_cleared()
+ * or a non-zero comp_mask, or the error from ib_copy_from_udata() /
+ * ib_destroy_rwq_ind_table().
+ */
+int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
+                                      struct ib_device *ib_dev,
+                                      struct ib_udata *ucore,
+                                      struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {};
+       struct ib_rwq_ind_table *tbl;
+       struct ib_uobject *uobj;
+       struct ib_wq **wqs;
+       size_t min_cmd_len;
+       int rc;
+
+       min_cmd_len = offsetof(typeof(cmd), ind_tbl_handle) +
+                     sizeof(cmd.ind_tbl_handle);
+       if (ucore->inlen < min_cmd_len)
+               return -EINVAL;
+
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       rc = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (rc)
+               return rc;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle,
+                             file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       tbl = uobj->object;
+       /* Saved now: tbl is not dereferenced after the destroy call below. */
+       wqs = tbl->ind_tbl;
+
+       rc = ib_destroy_rwq_ind_table(tbl);
+       if (rc) {
+               /* Destroy failed: the uobject stays live, only unlock it. */
+               put_uobj_write(uobj);
+               return rc;
+       }
+
+       uobj->live = 0;
+       put_uobj_write(uobj);
+
+       idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+       kfree(wqs);
+       return 0;
+}
+
 int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
                             struct ib_device *ib_dev,
                             struct ib_udata *ucore,
index 31f422a..0012fa5 100644 (file)
@@ -76,6 +76,8 @@ DEFINE_IDR(ib_uverbs_qp_idr);
 DEFINE_IDR(ib_uverbs_srq_idr);
 DEFINE_IDR(ib_uverbs_xrcd_idr);
 DEFINE_IDR(ib_uverbs_rule_idr);
+DEFINE_IDR(ib_uverbs_wq_idr);
+DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr);
 
 static DEFINE_SPINLOCK(map_lock);
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -130,6 +132,11 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
        [IB_USER_VERBS_EX_CMD_QUERY_DEVICE]     = ib_uverbs_ex_query_device,
        [IB_USER_VERBS_EX_CMD_CREATE_CQ]        = ib_uverbs_ex_create_cq,
        [IB_USER_VERBS_EX_CMD_CREATE_QP]        = ib_uverbs_ex_create_qp,
+       [IB_USER_VERBS_EX_CMD_CREATE_WQ]        = ib_uverbs_ex_create_wq,
+       [IB_USER_VERBS_EX_CMD_MODIFY_WQ]        = ib_uverbs_ex_modify_wq,
+       [IB_USER_VERBS_EX_CMD_DESTROY_WQ]       = ib_uverbs_ex_destroy_wq,
+       [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table,
+       [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table,
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
@@ -265,6 +272,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
                kfree(uqp);
        }
 
+       list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) {
+               struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object;
+               struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
+
+               idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+               ib_destroy_rwq_ind_table(rwq_ind_tbl);
+               kfree(ind_tbl);
+               kfree(uobj);
+       }
+
+       list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) {
+               struct ib_wq *wq = uobj->object;
+               struct ib_uwq_object *uwq =
+                       container_of(uobj, struct ib_uwq_object, uevent.uobject);
+
+               idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
+               ib_destroy_wq(wq);
+               ib_uverbs_release_uevent(file, &uwq->uevent);
+               kfree(uwq);
+       }
+
        list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
                struct ib_srq *srq = uobj->object;
                struct ib_uevent_object *uevent =
@@ -568,6 +596,16 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
                                &uobj->events_reported);
 }
 
+/*
+ * Async event callback for WQs: maps the WQ in @event back to its
+ * ib_uevent_object and forwards the event to ib_uverbs_async_handler()
+ * with @context_ptr as the first argument.
+ */
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr)
+{
+       struct ib_uevent_object *uevent;
+
+       uevent = container_of(event->element.wq->uobject,
+                             struct ib_uevent_object, uobject);
+
+       ib_uverbs_async_handler(context_ptr, uevent->uobject.user_handle,
+                               event->event, &uevent->event_list,
+                               &uevent->events_reported);
+}
+
 void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
 {
        struct ib_uevent_object *uobj;
@@ -931,6 +969,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        file->async_file = NULL;
        kref_init(&file->ref);
        mutex_init(&file->mutex);
+       mutex_init(&file->cleanup_mutex);
 
        filp->private_data = file;
        kobject_get(&dev->kobj);
@@ -956,18 +995,20 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
        struct ib_uverbs_file *file = filp->private_data;
        struct ib_uverbs_device *dev = file->device;
-       struct ib_ucontext *ucontext = NULL;
+
+       mutex_lock(&file->cleanup_mutex);
+       if (file->ucontext) {
+               ib_uverbs_cleanup_ucontext(file, file->ucontext);
+               file->ucontext = NULL;
+       }
+       mutex_unlock(&file->cleanup_mutex);
 
        mutex_lock(&file->device->lists_mutex);
-       ucontext = file->ucontext;
-       file->ucontext = NULL;
        if (!file->is_closed) {
                list_del(&file->list);
                file->is_closed = 1;
        }
        mutex_unlock(&file->device->lists_mutex);
-       if (ucontext)
-               ib_uverbs_cleanup_ucontext(file, ucontext);
 
        if (file->async_file)
                kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
@@ -1181,22 +1222,30 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
        mutex_lock(&uverbs_dev->lists_mutex);
        while (!list_empty(&uverbs_dev->uverbs_file_list)) {
                struct ib_ucontext *ucontext;
-
                file = list_first_entry(&uverbs_dev->uverbs_file_list,
                                        struct ib_uverbs_file, list);
                file->is_closed = 1;
-               ucontext = file->ucontext;
                list_del(&file->list);
-               file->ucontext = NULL;
                kref_get(&file->ref);
                mutex_unlock(&uverbs_dev->lists_mutex);
-               /* We must release the mutex before going ahead and calling
-                * disassociate_ucontext. disassociate_ucontext might end up
-                * indirectly calling uverbs_close, for example due to freeing
-                * the resources (e.g mmput).
-                */
+
                ib_uverbs_event_handler(&file->event_handler, &event);
+
+               mutex_lock(&file->cleanup_mutex);
+               ucontext = file->ucontext;
+               file->ucontext = NULL;
+               mutex_unlock(&file->cleanup_mutex);
+
+               /* At this point ib_uverbs_close cannot be running
+                * ib_uverbs_cleanup_ucontext
+                */
                if (ucontext) {
+                       /* We must release the mutex before going ahead and
+                        * calling disassociate_ucontext. disassociate_ucontext
+                        * might end up indirectly calling uverbs_close,
+                        * for example due to freeing the resources
+                        * (e.g mmput).
+                        */
                        ib_dev->disassociate_ucontext(ucontext);
                        ib_uverbs_cleanup_ucontext(file, ucontext);
                }
index 6298f54..2e813ed 100644 (file)
@@ -758,6 +758,12 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
        struct ib_qp *qp;
        int ret;
 
+       if (qp_init_attr->rwq_ind_tbl &&
+           (qp_init_attr->recv_cq ||
+           qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
+           qp_init_attr->cap.max_recv_sge))
+               return ERR_PTR(-EINVAL);
+
        /*
         * If the callers is using the RDMA API calculate the resources
         * needed for the RDMA READ/WRITE operations.
@@ -775,6 +781,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
        qp->real_qp    = qp;
        qp->uobject    = NULL;
        qp->qp_type    = qp_init_attr->qp_type;
+       qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
 
        atomic_set(&qp->usecnt, 0);
        qp->mrs_used = 0;
@@ -792,7 +799,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
                qp->srq = NULL;
        } else {
                qp->recv_cq = qp_init_attr->recv_cq;
-               atomic_inc(&qp_init_attr->recv_cq->usecnt);
+               if (qp_init_attr->recv_cq)
+                       atomic_inc(&qp_init_attr->recv_cq->usecnt);
                qp->srq = qp_init_attr->srq;
                if (qp->srq)
                        atomic_inc(&qp_init_attr->srq->usecnt);
@@ -803,7 +811,10 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
        qp->xrcd    = NULL;
 
        atomic_inc(&pd->usecnt);
-       atomic_inc(&qp_init_attr->send_cq->usecnt);
+       if (qp_init_attr->send_cq)
+               atomic_inc(&qp_init_attr->send_cq->usecnt);
+       if (qp_init_attr->rwq_ind_tbl)
+               atomic_inc(&qp->rwq_ind_tbl->usecnt);
 
        if (qp_init_attr->cap.max_rdma_ctxs) {
                ret = rdma_rw_init_mrs(qp, qp_init_attr);
@@ -1283,6 +1294,7 @@ int ib_destroy_qp(struct ib_qp *qp)
        struct ib_pd *pd;
        struct ib_cq *scq, *rcq;
        struct ib_srq *srq;
+       struct ib_rwq_ind_table *ind_tbl;
        int ret;
 
        WARN_ON_ONCE(qp->mrs_used > 0);
@@ -1297,6 +1309,7 @@ int ib_destroy_qp(struct ib_qp *qp)
        scq  = qp->send_cq;
        rcq  = qp->recv_cq;
        srq  = qp->srq;
+       ind_tbl = qp->rwq_ind_tbl;
 
        if (!qp->uobject)
                rdma_rw_cleanup_mrs(qp);
@@ -1311,6 +1324,8 @@ int ib_destroy_qp(struct ib_qp *qp)
                        atomic_dec(&rcq->usecnt);
                if (srq)
                        atomic_dec(&srq->usecnt);
+               if (ind_tbl)
+                       atomic_dec(&ind_tbl->usecnt);
        }
 
        return ret;
@@ -1558,6 +1573,150 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
 }
 EXPORT_SYMBOL(ib_dealloc_xrcd);
 
+/**
+ * ib_create_wq - Creates a WQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the WQ.
+ * @wq_attr: A list of initial attributes required to create the
+ * WQ. If WQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created WQ.
+ *
+ * wq_attr->max_wr and wq_attr->max_sge determine the requested size of
+ * the WQ, and are set to the actual values allocated on return.
+ * If ib_create_wq() succeeds, then max_wr and max_sge will always be
+ * at least as large as the requested values.
+ *
+ * Return: the new WQ on success, ERR_PTR(-ENOSYS) if the device does not
+ * provide a create_wq callback, or the error pointer returned by that
+ * callback.
+ */
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+                          struct ib_wq_init_attr *wq_attr)
+{
+       struct ib_device *device = pd->device;
+       struct ib_wq *wq;
+
+       if (!device->create_wq)
+               return ERR_PTR(-ENOSYS);
+
+       wq = device->create_wq(pd, wq_attr, NULL);
+       if (IS_ERR(wq))
+               return wq;
+
+       /* Fill in the generic fields of the object the driver handed back. */
+       wq->device = device;
+       wq->pd = pd;
+       wq->cq = wq_attr->cq;
+       wq->wq_type = wq_attr->wq_type;
+       wq->wq_context = wq_attr->wq_context;
+       wq->event_handler = wq_attr->event_handler;
+       wq->uobject = NULL;
+       atomic_set(&wq->usecnt, 0);
+
+       /* The WQ holds a use count on its PD and CQ until it is destroyed. */
+       atomic_inc(&pd->usecnt);
+       atomic_inc(&wq_attr->cq->usecnt);
+
+       return wq;
+}
+EXPORT_SYMBOL(ib_create_wq);
+
+/**
+ * ib_destroy_wq - Destroys the specified WQ.
+ * @wq: The WQ to destroy.
+ *
+ * Return: -EBUSY while the WQ's use count is non-zero, otherwise the
+ * result of the device's destroy_wq callback. The use counts taken on the
+ * PD and CQ are dropped only when that callback succeeds.
+ *
+ * NOTE(review): unlike the create/modify paths, destroy_wq is called
+ * without checking that the device provides it.
+ */
+int ib_destroy_wq(struct ib_wq *wq)
+{
+       /* Read these up front; @wq is not touched after destroy_wq(). */
+       struct ib_pd *pd = wq->pd;
+       struct ib_cq *cq = wq->cq;
+       int ret;
+
+       if (atomic_read(&wq->usecnt))
+               return -EBUSY;
+
+       ret = wq->device->destroy_wq(wq);
+       if (ret)
+               return ret;
+
+       atomic_dec(&pd->usecnt);
+       atomic_dec(&cq->usecnt);
+       return 0;
+}
+EXPORT_SYMBOL(ib_destroy_wq);
+
+/**
+ * ib_modify_wq - Modifies the specified WQ.
+ * @wq: The WQ to modify.
+ * @wq_attr: On input, specifies the WQ attributes to modify.
+ * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ
+ *   are being modified.
+ * On output, the current values of selected WQ attributes are returned.
+ *
+ * Return: -ENOSYS if the device does not provide a modify_wq callback,
+ * otherwise the value returned by that callback.
+ */
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+                u32 wq_attr_mask)
+{
+       struct ib_device *device = wq->device;
+
+       if (!device->modify_wq)
+               return -ENOSYS;
+
+       return device->modify_wq(wq, wq_attr, wq_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_wq);
+
+/**
+ * ib_create_rwq_ind_table - Creates a RQ Indirection Table.
+ * @device: The device on which to create the rwq indirection table.
+ * @init_attr: A list of initial attributes required to create the
+ * Indirection Table.
+ *
+ * Note: The life time of init_attr->ind_tbl is not less than the created
+ *     ib_rwq_ind_table object and the caller is responsible for its
+ *     memory allocation/free.
+ *
+ * Every WQ referenced by the table has its use count raised by one.
+ *
+ * Return: the new table on success, ERR_PTR(-ENOSYS) if the device does
+ * not provide a create_rwq_ind_table callback, or the error pointer
+ * returned by that callback.
+ *
+ * NOTE(review): the entry count is computed with a signed "1 <<" and
+ * compared against an int index; confirm log_ind_tbl_size is bounded
+ * well below 31 by the callers/drivers.
+ */
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+                                                struct ib_rwq_ind_table_init_attr *init_attr)
+{
+       struct ib_rwq_ind_table *tbl;
+       u32 nr_wqs;
+       int idx;
+
+       if (!device->create_rwq_ind_table)
+               return ERR_PTR(-ENOSYS);
+
+       nr_wqs = (1 << init_attr->log_ind_tbl_size);
+       tbl = device->create_rwq_ind_table(device, init_attr, NULL);
+       if (IS_ERR(tbl))
+               return tbl;
+
+       tbl->device = device;
+       tbl->uobject = NULL;
+       tbl->ind_tbl = init_attr->ind_tbl;
+       tbl->log_ind_tbl_size = init_attr->log_ind_tbl_size;
+       atomic_set(&tbl->usecnt, 0);
+
+       /* Mark each member WQ as in use by this table. */
+       for (idx = 0; idx < nr_wqs; idx++)
+               atomic_inc(&tbl->ind_tbl[idx]->usecnt);
+
+       return tbl;
+}
+EXPORT_SYMBOL(ib_create_rwq_ind_table);
+
+/**
+ * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table.
+ * @rwq_ind_table: The Indirection Table to destroy.
+ *
+ * Return: -EBUSY while the table's use count is non-zero, otherwise the
+ * result of the device's destroy_rwq_ind_table callback. The use counts
+ * held on the member WQs are dropped only when that callback succeeds.
+ */
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
+{
+       /* Read these up front; the table is not touched after destroy. */
+       struct ib_wq **wqs = rwq_ind_table->ind_tbl;
+       u32 nr_wqs = (1 << rwq_ind_table->log_ind_tbl_size);
+       int ret, idx;
+
+       if (atomic_read(&rwq_ind_table->usecnt))
+               return -EBUSY;
+
+       ret = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table);
+       if (ret)
+               return ret;
+
+       for (idx = 0; idx < nr_wqs; idx++)
+               atomic_dec(&wqs[idx]->usecnt);
+
+       return 0;
+}
+EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
+
 struct ib_flow *ib_create_flow(struct ib_qp *qp,
                               struct ib_flow_attr *flow_attr,
                               int domain)
index 3e8431b..04bbf17 100644 (file)
@@ -1396,10 +1396,10 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
        state_set(&child_ep->com, CONNECTING);
        child_ep->com.tdev = tdev;
        child_ep->com.cm_id = NULL;
-       child_ep->com.local_addr.sin_family = PF_INET;
+       child_ep->com.local_addr.sin_family = AF_INET;
        child_ep->com.local_addr.sin_port = req->local_port;
        child_ep->com.local_addr.sin_addr.s_addr = req->local_ip;
-       child_ep->com.remote_addr.sin_family = PF_INET;
+       child_ep->com.remote_addr.sin_family = AF_INET;
        child_ep->com.remote_addr.sin_port = req->peer_port;
        child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip;
        get_ep(&parent_ep->com);
index bb1a839..3edb806 100644 (file)
@@ -1183,18 +1183,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
        return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
 }
 
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-                                                ibdev.dev);
-       struct ethtool_drvinfo info;
-       struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
-
-       PDBG("%s dev 0x%p\n", __func__, dev);
-       lldev->ethtool_ops->get_drvinfo(lldev, &info);
-       return sprintf(buf, "%s\n", info.fw_version);
-}
-
 static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
                        char *buf)
 {
@@ -1334,13 +1322,11 @@ static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats,
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
 
 static struct device_attribute *iwch_class_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type,
        &dev_attr_board_id,
 };
@@ -1362,6 +1348,18 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+/*
+ * get_dev_fw_ver_str - installed as the ib_device get_dev_fw_str callback.
+ * @ibdev: The IB device whose firmware version is requested.
+ * @str: Output buffer for the NUL-terminated version string.
+ * @str_len: Size of @str in bytes.
+ *
+ * The version is obtained by calling the lower-level net device's ethtool
+ * get_drvinfo() directly and copying out its fw_version field.
+ */
+static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str,
+                              size_t str_len)
+{
+       struct iwch_dev *iwch_dev = to_iwch_dev(ibdev);
+       /*
+        * Start from a zeroed structure: get_drvinfo() is invoked directly
+        * here, so if the callback leaves fw_version untouched the "%s"
+        * below must see an empty string rather than uninitialized stack.
+        */
+       struct ethtool_drvinfo info = { 0 };
+       struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+
+       PDBG("%s dev 0x%p\n", __func__, iwch_dev);
+       lldev->ethtool_ops->get_drvinfo(lldev, &info);
+       snprintf(str, str_len, "%s", info.fw_version);
+}
+
 int iwch_register_device(struct iwch_dev *dev)
 {
        int ret;
@@ -1437,6 +1435,7 @@ int iwch_register_device(struct iwch_dev *dev)
        dev->ibdev.get_hw_stats = iwch_get_mib;
        dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
        dev->ibdev.get_port_immutable = iwch_port_immutable;
+       dev->ibdev.get_dev_fw_str = get_dev_fw_ver_str;
 
        dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
        if (!dev->ibdev.iwcm)
index a3a6721..3aca7f6 100644 (file)
@@ -294,6 +294,25 @@ static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
        return;
 }
 
+/*
+ * alloc_ep_skb_list - append @size freshly allocated skbs to @ep_skb_list.
+ * @ep_skb_list: An initialized skb queue to add the buffers to.
+ * @size: Number of skbs to allocate; zero or negative allocates nothing.
+ *
+ * Each skb is sized to roundup(sizeof(union cpl_wr_size), 16) bytes.
+ *
+ * Return: 0 on success. On allocation failure the whole queue is purged
+ * (including any skbs that were already on it) and -ENOMEM is returned.
+ */
+static int alloc_ep_skb_list(struct sk_buff_head *ep_skb_list, int size)
+{
+       struct sk_buff *skb;
+       size_t len;
+       /*
+        * Signed on purpose: with an unsigned index a negative @size would
+        * be converted to a huge bound and the loop would run until
+        * allocation failed.
+        */
+       int i;
+
+       len = roundup(sizeof(union cpl_wr_size), 16);
+       for (i = 0; i < size; i++) {
+               skb = alloc_skb(len, GFP_KERNEL);
+               if (!skb)
+                       goto fail;
+               skb_queue_tail(ep_skb_list, skb);
+       }
+       return 0;
+fail:
+       skb_queue_purge(ep_skb_list);
+       return -ENOMEM;
+}
+
 static void *alloc_ep(int size, gfp_t gfp)
 {
        struct c4iw_ep_common *epc;
@@ -384,6 +403,8 @@ void _c4iw_free_ep(struct kref *kref)
                if (ep->mpa_skb)
                        kfree_skb(ep->mpa_skb);
        }
+       if (!skb_queue_empty(&ep->com.ep_skb_list))
+               skb_queue_purge(&ep->com.ep_skb_list);
        kfree(ep);
 }
 
@@ -620,25 +641,27 @@ static void abort_arp_failure(void *handle, struct sk_buff *skb)
        }
 }
 
-static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+static int send_flowc(struct c4iw_ep *ep)
 {
-       unsigned int flowclen = 80;
        struct fw_flowc_wr *flowc;
+       struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list);
        int i;
        u16 vlan = ep->l2t->vlan;
        int nparams;
 
+       if (WARN_ON(!skb))
+               return -ENOMEM;
+
        if (vlan == CPL_L2T_VLAN_NONE)
                nparams = 8;
        else
                nparams = 9;
 
-       skb = get_skb(skb, flowclen, GFP_KERNEL);
-       flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+       flowc = (struct fw_flowc_wr *)__skb_put(skb, FLOWC_LEN);
 
        flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
                                           FW_FLOWC_WR_NPARAMS_V(nparams));
-       flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(flowclen,
+       flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(FLOWC_LEN,
                                          16)) | FW_WR_FLOWID_V(ep->hwtid));
 
        flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
@@ -679,18 +702,16 @@ static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
        return c4iw_ofld_send(&ep->com.dev->rdev, skb);
 }
 
-static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
+static int send_halfclose(struct c4iw_ep *ep)
 {
        struct cpl_close_con_req *req;
-       struct sk_buff *skb;
+       struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list);
        int wrlen = roundup(sizeof *req, 16);
 
        PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-       skb = get_skb(NULL, wrlen, gfp);
-       if (!skb) {
-               printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+       if (WARN_ON(!skb))
                return -ENOMEM;
-       }
+
        set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
        t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
        req = (struct cpl_close_con_req *) skb_put(skb, wrlen);
@@ -701,26 +722,24 @@ static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
        return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
 
-static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+static int send_abort(struct c4iw_ep *ep)
 {
        struct cpl_abort_req *req;
        int wrlen = roundup(sizeof *req, 16);
+       struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list);
 
        PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-       skb = get_skb(skb, wrlen, gfp);
-       if (!skb) {
-               printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
-                      __func__);
+       if (WARN_ON(!req_skb))
                return -ENOMEM;
-       }
-       set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-       t4_set_arp_err_handler(skb, ep, abort_arp_failure);
-       req = (struct cpl_abort_req *) skb_put(skb, wrlen);
+
+       set_wr_txq(req_skb, CPL_PRIORITY_DATA, ep->txq_idx);
+       t4_set_arp_err_handler(req_skb, ep, abort_arp_failure);
+       req = (struct cpl_abort_req *)skb_put(req_skb, wrlen);
        memset(req, 0, wrlen);
        INIT_TP_WR(req, ep->hwtid);
        OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
        req->cmd = CPL_ABORT_SEND_RST;
-       return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t);
 }
 
 static void best_mtu(const unsigned short *mtus, unsigned short mtu,
@@ -992,9 +1011,19 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
 
        mpa = (struct mpa_message *)(req + 1);
        memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
-       mpa->flags = (crc_enabled ? MPA_CRC : 0) |
-                    (markers_enabled ? MPA_MARKERS : 0) |
-                    (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0);
+
+       mpa->flags = 0;
+       if (crc_enabled)
+               mpa->flags |= MPA_CRC;
+       if (markers_enabled) {
+               mpa->flags |= MPA_MARKERS;
+               ep->mpa_attr.recv_marker_enabled = 1;
+       } else {
+               ep->mpa_attr.recv_marker_enabled = 0;
+       }
+       if (mpa_rev_to_use == 2)
+               mpa->flags |= MPA_ENHANCED_RDMA_CONN;
+
        mpa->private_data_size = htons(ep->plen);
        mpa->revision = mpa_rev_to_use;
        if (mpa_rev_to_use == 1) {
@@ -1169,8 +1198,11 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
        mpa = (struct mpa_message *)(req + 1);
        memset(mpa, 0, sizeof(*mpa));
        memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
-       mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
-                    (markers_enabled ? MPA_MARKERS : 0);
+       mpa->flags = 0;
+       if (ep->mpa_attr.crc_enabled)
+               mpa->flags |= MPA_CRC;
+       if (ep->mpa_attr.recv_marker_enabled)
+               mpa->flags |= MPA_MARKERS;
        mpa->revision = ep->mpa_attr.version;
        mpa->private_data_size = htons(plen);
 
@@ -1248,7 +1280,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
        set_bit(ACT_ESTAB, &ep->com.history);
 
        /* start MPA negotiation */
-       ret = send_flowc(ep, NULL);
+       ret = send_flowc(ep);
        if (ret)
                goto err;
        if (ep->retry_with_mpa_v1)
@@ -1555,7 +1587,6 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
         */
        __state_set(&ep->com, FPDU_MODE);
        ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
-       ep->mpa_attr.recv_marker_enabled = markers_enabled;
        ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
        ep->mpa_attr.version = mpa->revision;
        ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
@@ -2004,12 +2035,17 @@ static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
 }
 
 /*
- * Return whether a failed active open has allocated a TID
+ * Some of the error codes above implicitly indicate that there is no TID
+ * allocated with the result of an ACT_OPEN.  We use this predicate to make
+ * that explicit.
  */
 static inline int act_open_has_tid(int status)
 {
-       return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
-              status != CPL_ERR_ARP_MISS;
+       return (status != CPL_ERR_TCAM_PARITY &&
+               status != CPL_ERR_TCAM_MISS &&
+               status != CPL_ERR_TCAM_FULL &&
+               status != CPL_ERR_CONN_EXIST_SYNRECV &&
+               status != CPL_ERR_CONN_EXIST);
 }
 
 /* Returns whether a CPL status conveys negative advice.
@@ -2130,6 +2166,7 @@ out:
 static int c4iw_reconnect(struct c4iw_ep *ep)
 {
        int err = 0;
+       int size = 0;
        struct sockaddr_in *laddr = (struct sockaddr_in *)
                                    &ep->com.cm_id->m_local_addr;
        struct sockaddr_in *raddr = (struct sockaddr_in *)
@@ -2145,6 +2182,21 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
        init_timer(&ep->timer);
        c4iw_init_wr_wait(&ep->com.wr_wait);
 
+       /* When the MPA revision differs between the two nodes, the node
+        * with MPA_rev=2 retries the connection with MPA_rev 1 on the same
+        * EP through c4iw_reconnect(), where that EP is assigned a new tid
+        * for the new connection attempt. Because the same EP pointer is
+        * reused, some of its preallocated skbs were already consumed
+        * during the previous c4iw_connect(), which would leave the EP
+        * with too few skbs for the reconnect and eventually trip the
+        * empty-list assertion (WARN_ON) in peer_abort(). Replenish the
+        * skbs that have already been used.
+        */
+       size = (CN_MAX_CON_BUF - skb_queue_len(&ep->com.ep_skb_list));
+       if (alloc_ep_skb_list(&ep->com.ep_skb_list, size)) {
+               err = -ENOMEM;
+               goto fail1;
+       }
+
        /*
         * Allocate an active TID to initiate a TCP connection.
         */
@@ -2210,6 +2262,7 @@ fail2:
         * response of 1st connect request.
         */
        connect_reply_upcall(ep, -ECONNRESET);
+fail1:
        c4iw_put_ep(&ep->com);
 out:
        return err;
@@ -2576,6 +2629,10 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
        if (peer_mss && child_ep->mtu > (peer_mss + hdrs))
                child_ep->mtu = peer_mss + hdrs;
 
+       skb_queue_head_init(&child_ep->com.ep_skb_list);
+       if (alloc_ep_skb_list(&child_ep->com.ep_skb_list, CN_MAX_CON_BUF))
+               goto fail;
+
        state_set(&child_ep->com, CONNECTING);
        child_ep->com.dev = dev;
        child_ep->com.cm_id = NULL;
@@ -2640,6 +2697,8 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
                               (const u32 *)&sin6->sin6_addr.s6_addr, 1);
        }
        goto out;
+fail:
+       c4iw_put_ep(&child_ep->com);
 reject:
        reject_cr(dev, hwtid, skb);
        if (parent_ep)
@@ -2670,7 +2729,7 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
        ep->com.state = MPA_REQ_WAIT;
        start_ep_timer(ep);
        set_bit(PASS_ESTAB, &ep->com.history);
-       ret = send_flowc(ep, skb);
+       ret = send_flowc(ep);
        mutex_unlock(&ep->com.mutex);
        if (ret)
                c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
@@ -2871,10 +2930,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
        }
        mutex_unlock(&ep->com.mutex);
 
-       rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
-       if (!rpl_skb) {
-               printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
-                      __func__);
+       rpl_skb = skb_dequeue(&ep->com.ep_skb_list);
+       if (WARN_ON(!rpl_skb)) {
                release = 1;
                goto out;
        }
@@ -3011,9 +3068,9 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
                PDBG("%s last streaming msg ack ep %p tid %u state %u "
                     "initiator %u freeing skb\n", __func__, ep, ep->hwtid,
                     state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0);
+               mutex_lock(&ep->com.mutex);
                kfree_skb(ep->mpa_skb);
                ep->mpa_skb = NULL;
-               mutex_lock(&ep->com.mutex);
                if (test_bit(STOP_MPA_TIMER, &ep->com.flags))
                        stop_ep_timer(ep);
                mutex_unlock(&ep->com.mutex);
@@ -3025,9 +3082,9 @@ out:
 
 int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
 {
-       int err = 0;
-       int disconnect = 0;
+       int abort;
        struct c4iw_ep *ep = to_ep(cm_id);
+
        PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 
        mutex_lock(&ep->com.mutex);
@@ -3038,16 +3095,13 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
        }
        set_bit(ULP_REJECT, &ep->com.history);
        if (mpa_rev == 0)
-               disconnect = 2;
-       else {
-               err = send_mpa_reject(ep, pdata, pdata_len);
-               disconnect = 1;
-       }
+               abort = 1;
+       else
+               abort = send_mpa_reject(ep, pdata, pdata_len);
        mutex_unlock(&ep->com.mutex);
-       if (disconnect) {
-               stop_ep_timer(ep);
-               err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL);
-       }
+
+       stop_ep_timer(ep);
+       c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL);
        c4iw_put_ep(&ep->com);
        return 0;
 }
@@ -3248,6 +3302,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                err = -ENOMEM;
                goto out;
        }
+
+       skb_queue_head_init(&ep->com.ep_skb_list);
+       if (alloc_ep_skb_list(&ep->com.ep_skb_list, CN_MAX_CON_BUF)) {
+               err = -ENOMEM;
+               goto fail1;
+       }
+
        init_timer(&ep->timer);
        ep->plen = conn_param->private_data_len;
        if (ep->plen)
@@ -3266,7 +3327,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
        if (!ep->com.qp) {
                PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn);
                err = -EINVAL;
-               goto fail1;
+               goto fail2;
        }
        ref_qp(ep);
        PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
@@ -3279,7 +3340,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
        if (ep->atid == -1) {
                printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
                err = -ENOMEM;
-               goto fail1;
+               goto fail2;
        }
        insert_handle(dev, &dev->atid_idr, ep, ep->atid);
 
@@ -3303,7 +3364,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) {
                        err = pick_local_ipaddrs(dev, cm_id);
                        if (err)
-                               goto fail1;
+                               goto fail2;
                }
 
                /* find a route */
@@ -3323,7 +3384,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) {
                        err = pick_local_ip6addrs(dev, cm_id);
                        if (err)
-                               goto fail1;
+                               goto fail2;
                }
 
                /* find a route */
@@ -3339,14 +3400,14 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
        if (!ep->dst) {
                printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
                err = -EHOSTUNREACH;
-               goto fail2;
+               goto fail3;
        }
 
        err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true,
                        ep->com.dev->rdev.lldi.adapter_type, cm_id->tos);
        if (err) {
                printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
-               goto fail3;
+               goto fail4;
        }
 
        PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
@@ -3362,13 +3423,15 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                goto out;
 
        cxgb4_l2t_release(ep->l2t);
-fail3:
+fail4:
        dst_release(ep->dst);
-fail2:
+fail3:
        remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
        cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
-fail1:
+fail2:
+       skb_queue_purge(&ep->com.ep_skb_list);
        deref_cm_id(&ep->com);
+fail1:
        c4iw_put_ep(&ep->com);
 out:
        return err;
@@ -3461,6 +3524,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
                err = -ENOMEM;
                goto fail1;
        }
+       skb_queue_head_init(&ep->com.ep_skb_list);
        PDBG("%s ep %p\n", __func__, ep);
        ep->com.cm_id = cm_id;
        ref_cm_id(&ep->com);
@@ -3577,11 +3641,22 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
        case MPA_REQ_RCVD:
        case MPA_REP_SENT:
        case FPDU_MODE:
+       case CONNECTING:
                close = 1;
                if (abrupt)
                        ep->com.state = ABORTING;
                else {
                        ep->com.state = CLOSING;
+
+                       /*
+                        * if we close before we see the fw4_ack() then we fix
+                        * up the timer state since we're reusing it.
+                        */
+                       if (ep->mpa_skb &&
+                           test_bit(STOP_MPA_TIMER, &ep->com.flags)) {
+                               clear_bit(STOP_MPA_TIMER, &ep->com.flags);
+                               stop_ep_timer(ep);
+                       }
                        start_ep_timer(ep);
                }
                set_bit(CLOSE_SENT, &ep->com.flags);
@@ -3611,10 +3686,10 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
                if (abrupt) {
                        set_bit(EP_DISC_ABORT, &ep->com.history);
                        close_complete_upcall(ep, -ECONNRESET);
-                       ret = send_abort(ep, NULL, gfp);
+                       ret = send_abort(ep);
                } else {
                        set_bit(EP_DISC_CLOSE, &ep->com.history);
-                       ret = send_halfclose(ep, gfp);
+                       ret = send_halfclose(ep);
                }
                if (ret) {
                        set_bit(EP_DISC_FAIL, &ep->com.history);
index b0b9557..812ab72 100644 (file)
 #include "iw_cxgb4.h"
 
 static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
-                     struct c4iw_dev_ucontext *uctx)
+                     struct c4iw_dev_ucontext *uctx, struct sk_buff *skb)
 {
        struct fw_ri_res_wr *res_wr;
        struct fw_ri_res *res;
        int wr_len;
        struct c4iw_wr_wait wr_wait;
-       struct sk_buff *skb;
        int ret;
 
        wr_len = sizeof *res_wr + sizeof *res;
-       skb = alloc_skb(wr_len, GFP_KERNEL);
-       if (!skb)
-               return -ENOMEM;
        set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
 
        res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
@@ -863,7 +859,9 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq)
        ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
                                  : NULL;
        destroy_cq(&chp->rhp->rdev, &chp->cq,
-                  ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx);
+                  ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
+                  chp->destroy_skb);
+       chp->destroy_skb = NULL;
        kfree(chp);
        return 0;
 }
@@ -879,7 +877,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        struct c4iw_cq *chp;
        struct c4iw_create_cq_resp uresp;
        struct c4iw_ucontext *ucontext = NULL;
-       int ret;
+       int ret, wr_len;
        size_t memsize, hwentries;
        struct c4iw_mm_entry *mm, *mm2;
 
@@ -896,6 +894,13 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        if (!chp)
                return ERR_PTR(-ENOMEM);
 
+       wr_len = sizeof(struct fw_ri_res_wr) + sizeof(struct fw_ri_res);
+       chp->destroy_skb = alloc_skb(wr_len, GFP_KERNEL);
+       if (!chp->destroy_skb) {
+               ret = -ENOMEM;
+               goto err1;
+       }
+
        if (ib_context)
                ucontext = to_c4iw_ucontext(ib_context);
 
@@ -936,7 +941,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        ret = create_cq(&rhp->rdev, &chp->cq,
                        ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
        if (ret)
-               goto err1;
+               goto err2;
 
        chp->rhp = rhp;
        chp->cq.size--;                         /* status page */
@@ -947,15 +952,15 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        init_waitqueue_head(&chp->wait);
        ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
        if (ret)
-               goto err2;
+               goto err3;
 
        if (ucontext) {
                mm = kmalloc(sizeof *mm, GFP_KERNEL);
                if (!mm)
-                       goto err3;
+                       goto err4;
                mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
                if (!mm2)
-                       goto err4;
+                       goto err5;
 
                uresp.qid_mask = rhp->rdev.cqmask;
                uresp.cqid = chp->cq.cqid;
@@ -970,7 +975,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
                ret = ib_copy_to_udata(udata, &uresp,
                                       sizeof(uresp) - sizeof(uresp.reserved));
                if (ret)
-                       goto err5;
+                       goto err6;
 
                mm->key = uresp.key;
                mm->addr = virt_to_phys(chp->cq.queue);
@@ -986,15 +991,18 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
             __func__, chp->cq.cqid, chp, chp->cq.size,
             chp->cq.memsize, (unsigned long long) chp->cq.dma_addr);
        return &chp->ibcq;
-err5:
+err6:
        kfree(mm2);
-err4:
+err5:
        kfree(mm);
-err3:
+err4:
        remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
-err2:
+err3:
        destroy_cq(&chp->rhp->rdev, &chp->cq,
-                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
+                  chp->destroy_skb);
+err2:
+       kfree_skb(chp->destroy_skb);
 err1:
        kfree(chp);
        return ERR_PTR(ret);
index ae2e8b2..071d733 100644 (file)
@@ -317,7 +317,7 @@ static int qp_open(struct inode *inode, struct file *file)
        idr_for_each(&qpd->devp->qpidr, count_idrs, &count);
        spin_unlock_irq(&qpd->devp->lock);
 
-       qpd->bufsize = count * 128;
+       qpd->bufsize = count * 180;
        qpd->buf = vmalloc(qpd->bufsize);
        if (!qpd->buf) {
                kfree(qpd);
index f6f34a7..aa47e0a 100644 (file)
@@ -384,6 +384,7 @@ struct c4iw_mr {
        struct ib_mr ibmr;
        struct ib_umem *umem;
        struct c4iw_dev *rhp;
+       struct sk_buff *dereg_skb;
        u64 kva;
        struct tpt_attributes attr;
        u64 *mpl;
@@ -400,6 +401,7 @@ static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr)
 struct c4iw_mw {
        struct ib_mw ibmw;
        struct c4iw_dev *rhp;
+       struct sk_buff *dereg_skb;
        u64 kva;
        struct tpt_attributes attr;
 };
@@ -412,6 +414,7 @@ static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw)
 struct c4iw_cq {
        struct ib_cq ibcq;
        struct c4iw_dev *rhp;
+       struct sk_buff *destroy_skb;
        struct t4_cq cq;
        spinlock_t lock;
        spinlock_t comp_handler_lock;
@@ -472,7 +475,7 @@ struct c4iw_qp {
        struct t4_wq wq;
        spinlock_t lock;
        struct mutex mutex;
-       atomic_t refcnt;
+       struct kref kref;
        wait_queue_head_t wait;
        struct timer_list timer;
        int sq_sig_all;
@@ -789,10 +792,29 @@ enum c4iw_ep_history {
        CM_ID_DEREFED           = 28,
 };
 
+/*
+ * Kinds of per-endpoint preallocated skbs. CN_MAX_CON_BUF, the number of
+ * entries before it, is the count passed to alloc_ep_skb_list() when an
+ * endpoint's ep_skb_list is filled.
+ * NOTE(review): the individual names presumably correspond to the
+ * messages sent from the list (abort req/rpl, close, destroy, flowc);
+ * the list itself is consumed with skb_dequeue(), not indexed by these
+ * values -- confirm.
+ */
+enum conn_pre_alloc_buffers {
+       CN_ABORT_REQ_BUF,
+       CN_ABORT_RPL_BUF,
+       CN_CLOSE_CON_REQ_BUF,
+       CN_DESTROY_BUF,
+       CN_FLOWC_BUF,
+       CN_MAX_CON_BUF
+};
+
+/* Length in bytes of the FLOWC work request built by send_flowc(). */
+#define FLOWC_LEN 80
+/*
+ * Used for its size: alloc_ep_skb_list() allocates each preallocated skb
+ * with roundup(sizeof(union cpl_wr_size), 16) bytes, i.e. large enough
+ * for the biggest of the members below.
+ */
+union cpl_wr_size {
+       struct cpl_abort_req abrt_req;
+       struct cpl_abort_rpl abrt_rpl;
+       struct fw_ri_wr ri_req;
+       struct cpl_close_con_req close_req;
+       char flowc_buf[FLOWC_LEN];
+};
+
 struct c4iw_ep_common {
        struct iw_cm_id *cm_id;
        struct c4iw_qp *qp;
        struct c4iw_dev *dev;
+       struct sk_buff_head ep_skb_list;
        enum c4iw_ep_state state;
        struct kref kref;
        struct mutex mutex;
index 55d0651..0b91b0f 100644 (file)
@@ -59,9 +59,9 @@ static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
 }
 
 static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
-                                      u32 len, dma_addr_t data, int wait)
+                                      u32 len, dma_addr_t data,
+                                      int wait, struct sk_buff *skb)
 {
-       struct sk_buff *skb;
        struct ulp_mem_io *req;
        struct ulptx_sgl *sgl;
        u8 wr_len;
@@ -74,9 +74,11 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
                c4iw_init_wr_wait(&wr_wait);
        wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16);
 
-       skb = alloc_skb(wr_len, GFP_KERNEL);
-       if (!skb)
-               return -ENOMEM;
+       if (!skb) {
+               skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+               if (!skb)
+                       return -ENOMEM;
+       }
        set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
 
        req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
@@ -108,9 +110,8 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
 }
 
 static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
-                                 void *data)
+                                 void *data, struct sk_buff *skb)
 {
-       struct sk_buff *skb;
        struct ulp_mem_io *req;
        struct ulptx_idata *sc;
        u8 wr_len, *to_dp, *from_dp;
@@ -134,9 +135,11 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
                wr_len = roundup(sizeof *req + sizeof *sc +
                                 roundup(copy_len, T4_ULPTX_MIN_IO), 16);
 
-               skb = alloc_skb(wr_len, GFP_KERNEL);
-               if (!skb)
-                       return -ENOMEM;
+               if (!skb) {
+                       skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+                       if (!skb)
+                               return -ENOMEM;
+               }
                set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
 
                req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
@@ -173,6 +176,7 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
                        memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO -
                               (copy_len % T4_ULPTX_MIN_IO));
                ret = c4iw_ofld_send(rdev, skb);
+               skb = NULL;
                if (ret)
                        return ret;
                len -= C4IW_MAX_INLINE_SIZE;
@@ -182,7 +186,8 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
        return ret;
 }
 
-static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data)
+static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len,
+                              void *data, struct sk_buff *skb)
 {
        u32 remain = len;
        u32 dmalen;
@@ -205,7 +210,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *
                        dmalen = T4_ULPTX_MAX_DMA;
                remain -= dmalen;
                ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr,
-                                                !remain);
+                                                !remain, skb);
                if (ret)
                        goto out;
                addr += dmalen >> 5;
@@ -213,7 +218,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *
                daddr += dmalen;
        }
        if (remain)
-               ret = _c4iw_write_mem_inline(rdev, addr, remain, data);
+               ret = _c4iw_write_mem_inline(rdev, addr, remain, data, skb);
 out:
        dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE);
        return ret;
@@ -224,23 +229,25 @@ out:
  * If data is NULL, clear len byte of memory to zero.
  */
 static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len,
-                            void *data)
+                            void *data, struct sk_buff *skb)
 {
        if (is_t5(rdev->lldi.adapter_type) && use_dsgl) {
                if (len > inline_threshold) {
-                       if (_c4iw_write_mem_dma(rdev, addr, len, data)) {
+                       if (_c4iw_write_mem_dma(rdev, addr, len, data, skb)) {
                                printk_ratelimited(KERN_WARNING
                                                   "%s: dma map"
                                                   " failure (non fatal)\n",
                                                   pci_name(rdev->lldi.pdev));
                                return _c4iw_write_mem_inline(rdev, addr, len,
-                                                             data);
-                       } else
+                                                             data, skb);
+                       } else {
                                return 0;
+                       }
                } else
-                       return _c4iw_write_mem_inline(rdev, addr, len, data);
+                       return _c4iw_write_mem_inline(rdev, addr,
+                                                     len, data, skb);
        } else
-               return _c4iw_write_mem_inline(rdev, addr, len, data);
+               return _c4iw_write_mem_inline(rdev, addr, len, data, skb);
 }
 
 /*
@@ -253,7 +260,8 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
                           u32 *stag, u8 stag_state, u32 pdid,
                           enum fw_ri_stag_type type, enum fw_ri_mem_perms perm,
                           int bind_enabled, u32 zbva, u64 to,
-                          u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr)
+                          u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr,
+                          struct sk_buff *skb)
 {
        int err;
        struct fw_ri_tpte tpt;
@@ -307,7 +315,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
        }
        err = write_adapter_mem(rdev, stag_idx +
                                (rdev->lldi.vr->stag.start >> 5),
-                               sizeof(tpt), &tpt);
+                               sizeof(tpt), &tpt, skb);
 
        if (reset_tpt_entry) {
                c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
@@ -327,28 +335,29 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
             __func__, pbl_addr, rdev->lldi.vr->pbl.start,
             pbl_size);
 
-       err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl);
+       err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl, NULL);
        return err;
 }
 
 static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size,
-                    u32 pbl_addr)
+                    u32 pbl_addr, struct sk_buff *skb)
 {
        return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0,
-                              pbl_size, pbl_addr);
+                              pbl_size, pbl_addr, skb);
 }
 
 static int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid)
 {
        *stag = T4_STAG_UNSET;
        return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0,
-                              0UL, 0, 0, 0, 0);
+                              0UL, 0, 0, 0, 0, NULL);
 }
 
-static int deallocate_window(struct c4iw_rdev *rdev, u32 stag)
+static int deallocate_window(struct c4iw_rdev *rdev, u32 stag,
+                            struct sk_buff *skb)
 {
        return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0,
-                              0);
+                              0, skb);
 }
 
 static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
@@ -356,7 +365,7 @@ static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
 {
        *stag = T4_STAG_UNSET;
        return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0,
-                              0UL, 0, 0, pbl_size, pbl_addr);
+                              0UL, 0, 0, pbl_size, pbl_addr, NULL);
 }
 
 static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag)
@@ -383,14 +392,16 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
                              mhp->attr.mw_bind_enable, mhp->attr.zbva,
                              mhp->attr.va_fbo, mhp->attr.len ?
                              mhp->attr.len : -1, shift - 12,
-                             mhp->attr.pbl_size, mhp->attr.pbl_addr);
+                             mhp->attr.pbl_size, mhp->attr.pbl_addr, NULL);
        if (ret)
                return ret;
 
        ret = finish_mem_reg(mhp, stag);
-       if (ret)
+       if (ret) {
                dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
+                         mhp->attr.pbl_addr, mhp->dereg_skb);
+               mhp->dereg_skb = NULL;
+       }
        return ret;
 }
 
@@ -423,6 +434,12 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
        if (!mhp)
                return ERR_PTR(-ENOMEM);
 
+       mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+       if (!mhp->dereg_skb) {
+               ret = -ENOMEM;
+               goto err0;
+       }
+
        mhp->rhp = rhp;
        mhp->attr.pdid = php->pdid;
        mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
@@ -435,7 +452,8 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
 
        ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid,
                              FW_RI_STAG_NSMR, mhp->attr.perms,
-                             mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0);
+                             mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0,
+                             NULL);
        if (ret)
                goto err1;
 
@@ -445,8 +463,10 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
        return &mhp->ibmr;
 err2:
        dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                 mhp->attr.pbl_addr);
+                 mhp->attr.pbl_addr, mhp->dereg_skb);
 err1:
+       kfree_skb(mhp->dereg_skb);
+err0:
        kfree(mhp);
        return ERR_PTR(ret);
 }
@@ -481,11 +501,18 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        if (!mhp)
                return ERR_PTR(-ENOMEM);
 
+       mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+       if (!mhp->dereg_skb) {
+               kfree(mhp);
+               return ERR_PTR(-ENOMEM);
+       }
+
        mhp->rhp = rhp;
 
        mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
        if (IS_ERR(mhp->umem)) {
                err = PTR_ERR(mhp->umem);
+               kfree_skb(mhp->dereg_skb);
                kfree(mhp);
                return ERR_PTR(err);
        }
@@ -550,6 +577,7 @@ err_pbl:
 
 err:
        ib_umem_release(mhp->umem);
+       kfree_skb(mhp->dereg_skb);
        kfree(mhp);
        return ERR_PTR(err);
 }
@@ -572,11 +600,16 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
        mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
        if (!mhp)
                return ERR_PTR(-ENOMEM);
-       ret = allocate_window(&rhp->rdev, &stag, php->pdid);
-       if (ret) {
-               kfree(mhp);
-               return ERR_PTR(ret);
+
+       mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+       if (!mhp->dereg_skb) {
+               ret = -ENOMEM;
+               goto free_mhp;
        }
+
+       ret = allocate_window(&rhp->rdev, &stag, php->pdid);
+       if (ret)
+               goto free_skb;
        mhp->rhp = rhp;
        mhp->attr.pdid = php->pdid;
        mhp->attr.type = FW_RI_STAG_MW;
@@ -584,12 +617,19 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
        mmid = (stag) >> 8;
        mhp->ibmw.rkey = stag;
        if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
-               deallocate_window(&rhp->rdev, mhp->attr.stag);
-               kfree(mhp);
-               return ERR_PTR(-ENOMEM);
+               ret = -ENOMEM;
+               goto dealloc_win;
        }
        PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
        return &(mhp->ibmw);
+
+dealloc_win:
+       deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb);
+free_skb:
+       kfree_skb(mhp->dereg_skb);
+free_mhp:
+       kfree(mhp);
+       return ERR_PTR(ret);
 }
 
 int c4iw_dealloc_mw(struct ib_mw *mw)
@@ -602,7 +642,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
        rhp = mhp->rhp;
        mmid = (mw->rkey) >> 8;
        remove_handle(rhp, &rhp->mmidr, mmid);
-       deallocate_window(&rhp->rdev, mhp->attr.stag);
+       deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb);
+       kfree_skb(mhp->dereg_skb);
        kfree(mhp);
        PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
        return 0;
@@ -666,7 +707,7 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
        return &(mhp->ibmr);
 err3:
        dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
+                 mhp->attr.pbl_addr, mhp->dereg_skb);
 err2:
        c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
                              mhp->attr.pbl_size << 3);
@@ -717,7 +758,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr)
                dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev,
                                  mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr);
        dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
+                 mhp->attr.pbl_addr, mhp->dereg_skb);
        if (mhp->attr.pbl_size)
                c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
                                  mhp->attr.pbl_size << 3);
index dd8a86b..df127ce 100644 (file)
@@ -409,20 +409,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
                       CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
 }
 
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
-                          char *buf)
-{
-       struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-                                                ibdev.dev);
-       PDBG("%s dev 0x%p\n", __func__, dev);
-
-       return sprintf(buf, "%u.%u.%u.%u\n",
-                       FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
-                       FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
-                       FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
-                       FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
-}
-
 static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
                        char *buf)
 {
@@ -502,13 +488,11 @@ static int c4iw_get_mib(struct ib_device *ibdev,
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
 
 static struct device_attribute *c4iw_class_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type,
        &dev_attr_board_id,
 };
@@ -530,6 +514,20 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+static void get_dev_fw_str(struct ib_device *dev, char *str,
+                          size_t str_len)
+{
+       struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+                                                ibdev);
+       PDBG("%s dev 0x%p\n", __func__, dev);
+
+       snprintf(str, str_len, "%u.%u.%u.%u",
+                FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
+                FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
+                FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
+                FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
+}
+
 int c4iw_register_device(struct c4iw_dev *dev)
 {
        int ret;
@@ -605,6 +603,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
        dev->ibdev.get_hw_stats = c4iw_get_mib;
        dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
        dev->ibdev.get_port_immutable = c4iw_port_immutable;
+       dev->ibdev.get_dev_fw_str = get_dev_fw_str;
        dev->ibdev.drain_sq = c4iw_drain_sq;
        dev->ibdev.drain_rq = c4iw_drain_rq;
 
index e8993e4..edb1172 100644 (file)
@@ -683,17 +683,25 @@ static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr,
        return 0;
 }
 
+void _free_qp(struct kref *kref)
+{
+       struct c4iw_qp *qhp;
+
+       qhp = container_of(kref, struct c4iw_qp, kref);
+       PDBG("%s qhp %p\n", __func__, qhp);
+       kfree(qhp);
+}
+
 void c4iw_qp_add_ref(struct ib_qp *qp)
 {
        PDBG("%s ib_qp %p\n", __func__, qp);
-       atomic_inc(&(to_c4iw_qp(qp)->refcnt));
+       kref_get(&to_c4iw_qp(qp)->kref);
 }
 
 void c4iw_qp_rem_ref(struct ib_qp *qp)
 {
        PDBG("%s ib_qp %p\n", __func__, qp);
-       if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt)))
-               wake_up(&(to_c4iw_qp(qp)->wait));
+       kref_put(&to_c4iw_qp(qp)->kref, _free_qp);
 }
 
 static void add_to_fc_list(struct list_head *head, struct list_head *entry)
@@ -1081,9 +1089,10 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
        PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
             qhp->ep->hwtid);
 
-       skb = alloc_skb(sizeof *wqe, gfp);
-       if (!skb)
+       skb = skb_dequeue(&qhp->ep->com.ep_skb_list);
+       if (WARN_ON(!skb))
                return;
+
        set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
 
        wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
@@ -1202,9 +1211,10 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
        PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
             ep->hwtid);
 
-       skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
-       if (!skb)
+       skb = skb_dequeue(&ep->com.ep_skb_list);
+       if (WARN_ON(!skb))
                return -ENOMEM;
+
        set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
 
        wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
@@ -1592,8 +1602,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
        wait_event(qhp->wait, !qhp->ep);
 
        remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
-       atomic_dec(&qhp->refcnt);
-       wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
 
        spin_lock_irq(&rhp->lock);
        if (!list_empty(&qhp->db_fc_entry))
@@ -1606,8 +1614,9 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
        destroy_qp(&rhp->rdev, &qhp->wq,
                   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
 
+       c4iw_qp_rem_ref(ib_qp);
+
        PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid);
-       kfree(qhp);
        return 0;
 }
 
@@ -1704,7 +1713,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
        init_completion(&qhp->rq_drained);
        mutex_init(&qhp->mutex);
        init_waitqueue_head(&qhp->wait);
-       atomic_set(&qhp->refcnt, 1);
+       kref_init(&qhp->kref);
 
        ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
        if (ret)
@@ -1896,12 +1905,20 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        return 0;
 }
 
+static void move_qp_to_err(struct c4iw_qp *qp)
+{
+       struct c4iw_qp_attributes attrs = { .next_state = C4IW_QP_STATE_ERROR };
+
+       (void)c4iw_modify_qp(qp->rhp, qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+}
+
 void c4iw_drain_sq(struct ib_qp *ibqp)
 {
        struct c4iw_qp *qp = to_c4iw_qp(ibqp);
        unsigned long flag;
        bool need_to_wait;
 
+       move_qp_to_err(qp);
        spin_lock_irqsave(&qp->lock, flag);
        need_to_wait = !t4_sq_empty(&qp->wq);
        spin_unlock_irqrestore(&qp->lock, flag);
@@ -1916,6 +1933,7 @@ void c4iw_drain_rq(struct ib_qp *ibqp)
        unsigned long flag;
        bool need_to_wait;
 
+       move_qp_to_err(qp);
        spin_lock_irqsave(&qp->lock, flag);
        need_to_wait = !t4_rq_empty(&qp->wq);
        spin_unlock_irqrestore(&qp->lock, flag);
index a925fb0..f846fd5 100644 (file)
@@ -3,7 +3,6 @@ config INFINIBAND_HFI1
        depends on X86_64 && INFINIBAND_RDMAVT
        select MMU_NOTIFIER
        select CRC32
-       default m
        ---help---
        This is a low-level driver for Intel OPA Gen1 adapter.
 config HFI1_DEBUG_SDMA_ORDER
index c702a00..32c19fa 100644 (file)
@@ -228,7 +228,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
                                    sizeof(struct hfi1_base_info));
                break;
        case HFI1_IOCTL_CREDIT_UPD:
-               if (uctxt && uctxt->sc)
+               if (uctxt)
                        sc_return_credits(uctxt->sc);
                break;
 
index 4417a0f..49a71e2 100644 (file)
@@ -1174,6 +1174,8 @@ struct hfi1_devdata {
 
 /* 8051 firmware version helper */
 #define dc8051_ver(a, b) ((a) << 8 | (b))
+#define dc8051_ver_maj(a) ((a & 0xff00) >> 8)
+#define dc8051_ver_min(a)  (a & 0x00ff)
 
 /* f_put_tid types */
 #define PT_EXPECTED 0
index 849c4b9..dd4be3c 100644 (file)
@@ -1291,9 +1291,12 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
 {
        struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+       u16 ver = dd->dc8051_ver;
 
        memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
 
+       rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 16) |
+                                   (u64)dc8051_ver_min(ver);
        rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
                        IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
                        IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
@@ -1567,6 +1570,17 @@ static void init_ibport(struct hfi1_pportdata *ppd)
        RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
 }
 
+static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str,
+                               size_t str_len)
+{
+       struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+       struct hfi1_ibdev *dev = dev_from_rdi(rdi);
+       u16 ver = dd_from_dev(dev)->dc8051_ver;
+
+       snprintf(str, str_len, "%u.%u", dc8051_ver_maj(ver),
+                dc8051_ver_min(ver));
+}
+
 /**
  * hfi1_register_ib_device - register our device with the infiniband core
  * @dd: the device data structure
@@ -1613,6 +1627,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 
        /* keep process mad in the driver */
        ibdev->process_mad = hfi1_process_mad;
+       ibdev->get_dev_fw_str = hfi1_get_dev_fw_str;
 
        strncpy(ibdev->node_desc, init_utsname()->nodename,
                sizeof(ibdev->node_desc));
index d2fa725..5026dc7 100644 (file)
@@ -1567,12 +1567,12 @@ static enum i40iw_status_code i40iw_del_multiple_qhash(
                ret = i40iw_manage_qhash(iwdev, cm_info,
                                         I40IW_QHASH_TYPE_TCP_SYN,
                                         I40IW_QHASH_MANAGE_TYPE_DELETE, NULL, false);
-               kfree(child_listen_node);
-               cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++;
                i40iw_debug(&iwdev->sc_dev,
                            I40IW_DEBUG_CM,
                            "freed pointer = %p\n",
                            child_listen_node);
+               kfree(child_listen_node);
+               cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++;
        }
        spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags);
 
index bd942da..2fac1db 100644 (file)
@@ -1557,6 +1557,9 @@ enum i40iw_alignment {
 #define I40IW_RING_MOVE_TAIL(_ring) \
        (_ring).tail = ((_ring).tail + 1) % (_ring).size
 
+#define I40IW_RING_MOVE_HEAD_NOCHECK(_ring) \
+       (_ring).head = ((_ring).head + 1) % (_ring).size
+
 #define I40IW_RING_MOVE_TAIL_BY_COUNT(_ring, _count) \
        (_ring).tail = ((_ring).tail + (_count)) % (_ring).size
 
index e9c6e82..c62d354 100644 (file)
@@ -1025,6 +1025,8 @@ static void  i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq,
        u16 txoffset, bufoffset;
 
        buf = i40iw_puda_get_listbuf(pbufl);
+       if (!buf)
+               return;
        nextseqnum = buf->seqnum + fpdu_len;
        txbuf->totallen = buf->hdrlen + fpdu_len;
        txbuf->data = (u8 *)txbuf->mem.va + buf->hdrlen;
@@ -1048,6 +1050,8 @@ static void  i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq,
                fpdu_len -= buf->datalen;
                i40iw_puda_ret_bufpool(ieq, buf);
                buf = i40iw_puda_get_listbuf(pbufl);
+               if (!buf)
+                       return;
                bufoffset = (u16)(buf->data - (u8 *)buf->mem.va);
        } while (1);
 
index 16cc617..2b1a04e 100644 (file)
@@ -667,7 +667,7 @@ struct i40iw_tcp_offload_info {
        bool time_stamp;
        u8 cwnd_inc_limit;
        bool drop_ooo_seg;
-       bool dup_ack_thresh;
+       u8 dup_ack_thresh;
        u8 ttl;
        u8 src_mac_addr_idx;
        bool avoid_stretch_ack;
index e35faea..4d28c3c 100644 (file)
@@ -291,9 +291,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
 
        i40iw_set_fragment(wqe, 0, op_info->lo_sg_list);
 
-       for (i = 1; i < op_info->num_lo_sges; i++) {
-               byte_off = 32 + (i - 1) * 16;
+       for (i = 1, byte_off = 32; i < op_info->num_lo_sges; i++) {
                i40iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i]);
+               byte_off += 16;
        }
 
        wmb(); /* make sure WQE is populated before valid bit is set */
@@ -401,9 +401,9 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
 
        i40iw_set_fragment(wqe, 0, op_info->sg_list);
 
-       for (i = 1; i < op_info->num_sges; i++) {
-               byte_off = 32 + (i - 1) * 16;
+       for (i = 1, byte_off = 32; i < op_info->num_sges; i++) {
                i40iw_set_fragment(wqe, byte_off, &op_info->sg_list[i]);
+               byte_off += 16;
        }
 
        wmb(); /* make sure WQE is populated before valid bit is set */
@@ -685,9 +685,9 @@ static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp,
 
        i40iw_set_fragment(wqe, 0, info->sg_list);
 
-       for (i = 1; i < info->num_sges; i++) {
-               byte_off = 32 + (i - 1) * 16;
+       for (i = 1, byte_off = 32; i < info->num_sges; i++) {
                i40iw_set_fragment(wqe, byte_off, &info->sg_list[i]);
+               byte_off += 16;
        }
 
        wmb(); /* make sure WQE is populated before valid bit is set */
@@ -753,8 +753,7 @@ static enum i40iw_status_code i40iw_cq_post_entries(struct i40iw_cq_uk *cq,
  * @post_cq: update cq tail
  */
 static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
-                                                      struct i40iw_cq_poll_info *info,
-                                                      bool post_cq)
+                                                      struct i40iw_cq_poll_info *info)
 {
        u64 comp_ctx, qword0, qword2, qword3, wqe_qword;
        u64 *cqe, *sw_wqe;
@@ -762,7 +761,6 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
        struct i40iw_ring *pring = NULL;
        u32 wqe_idx, q_type, array_idx = 0;
        enum i40iw_status_code ret_code = 0;
-       enum i40iw_status_code ret_code2 = 0;
        bool move_cq_head = true;
        u8 polarity;
        u8 addl_wqes = 0;
@@ -870,19 +868,14 @@ exit:
                        move_cq_head = false;
 
        if (move_cq_head) {
-               I40IW_RING_MOVE_HEAD(cq->cq_ring, ret_code2);
-
-               if (ret_code2 && !ret_code)
-                       ret_code = ret_code2;
+               I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
 
                if (I40IW_RING_GETCURRENT_HEAD(cq->cq_ring) == 0)
                        cq->polarity ^= 1;
 
-               if (post_cq) {
-                       I40IW_RING_MOVE_TAIL(cq->cq_ring);
-                       set_64bit_val(cq->shadow_area, 0,
-                                     I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
-               }
+               I40IW_RING_MOVE_TAIL(cq->cq_ring);
+               set_64bit_val(cq->shadow_area, 0,
+                             I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
        } else {
                if (info->is_srq)
                        return ret_code;
index 4627646..276bcef 100644 (file)
@@ -327,7 +327,7 @@ struct i40iw_cq_ops {
        void (*iw_cq_request_notification)(struct i40iw_cq_uk *,
                                           enum i40iw_completion_notify);
        enum i40iw_status_code (*iw_cq_poll_completion)(struct i40iw_cq_uk *,
-                                                       struct i40iw_cq_poll_info *, bool);
+                                                       struct i40iw_cq_poll_info *);
        enum i40iw_status_code (*iw_cq_post_entries)(struct i40iw_cq_uk *, u8 count);
        void (*iw_cq_clean)(void *, struct i40iw_cq_uk *);
 };
index 283b64c..2360338 100644 (file)
@@ -529,7 +529,7 @@ static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev,
                status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, 0, &rqshift);
 
        if (status)
-               return -ENOSYS;
+               return -ENOMEM;
 
        sqdepth = sq_size << sqshift;
        rqdepth = rq_size << rqshift;
@@ -671,7 +671,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
        iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp;
 
        if (init_attr->qp_type != IB_QPT_RC) {
-               err_code = -ENOSYS;
+               err_code = -EINVAL;
                goto error;
        }
        if (iwdev->push_mode)
@@ -1840,6 +1840,7 @@ struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *pd,
        iwmr->ibmr.lkey = stag;
        iwmr->page_cnt = 1;
        iwmr->pgaddrmem[0]  = addr;
+       iwmr->length = size;
        status = i40iw_hwreg_mr(iwdev, iwmr, access);
        if (status) {
                i40iw_free_stag(iwdev, stag);
@@ -1863,7 +1864,7 @@ static struct ib_mr *i40iw_get_dma_mr(struct ib_pd *pd, int acc)
 {
        u64 kva = 0;
 
-       return i40iw_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva);
+       return i40iw_reg_phys_mr(pd, 0, 0, acc, &kva);
 }
 
 /**
@@ -1974,18 +1975,6 @@ static ssize_t i40iw_show_rev(struct device *dev,
        return sprintf(buf, "%x\n", hw_rev);
 }
 
-/**
- * i40iw_show_fw_ver
- */
-static ssize_t i40iw_show_fw_ver(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       u32 firmware_version = I40IW_FW_VERSION;
-
-       return sprintf(buf, "%u.%u\n", firmware_version,
-                      (firmware_version & 0x000000ff));
-}
-
 /**
  * i40iw_show_hca
  */
@@ -2006,13 +1995,11 @@ static ssize_t i40iw_show_board(struct device *dev,
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, i40iw_show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL);
 
 static struct device_attribute *i40iw_dev_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type,
        &dev_attr_board_id
 };
@@ -2091,8 +2078,12 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                                ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
                        }
 
-                       if (ret)
-                               err = -EIO;
+                       if (ret) {
+                               if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                                       err = -ENOMEM;
+                               else
+                                       err = -EINVAL;
+                       }
                        break;
                case IB_WR_RDMA_WRITE:
                        info.op_type = I40IW_OP_TYPE_RDMA_WRITE;
@@ -2113,8 +2104,12 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                                ret = ukqp->ops.iw_rdma_write(ukqp, &info, false);
                        }
 
-                       if (ret)
-                               err = -EIO;
+                       if (ret) {
+                               if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                                       err = -ENOMEM;
+                               else
+                                       err = -EINVAL;
+                       }
                        break;
                case IB_WR_RDMA_READ_WITH_INV:
                        inv_stag = true;
@@ -2132,15 +2127,19 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                        info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey;
                        info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length;
                        ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false);
-                       if (ret)
-                               err = -EIO;
+                       if (ret) {
+                               if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                                       err = -ENOMEM;
+                               else
+                                       err = -EINVAL;
+                       }
                        break;
                case IB_WR_LOCAL_INV:
                        info.op_type = I40IW_OP_TYPE_INV_STAG;
                        info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
                        ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true);
                        if (ret)
-                               err = -EIO;
+                               err = -ENOMEM;
                        break;
                case IB_WR_REG_MR:
                {
@@ -2174,7 +2173,7 @@ static int i40iw_post_send(struct ib_qp *ibqp,
 
                        ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true);
                        if (ret)
-                               err = -EIO;
+                               err = -ENOMEM;
                        break;
                }
                default:
@@ -2214,6 +2213,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
        struct i40iw_sge sg_list[I40IW_MAX_WQ_FRAGMENT_COUNT];
        enum i40iw_status_code ret = 0;
        unsigned long flags;
+       int err = 0;
 
        iwqp = (struct i40iw_qp *)ibqp;
        ukqp = &iwqp->sc_qp.qp_uk;
@@ -2228,6 +2228,10 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
                ret = ukqp->ops.iw_post_receive(ukqp, &post_recv);
                if (ret) {
                        i40iw_pr_err(" post_recv err %d\n", ret);
+                       if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                               err = -ENOMEM;
+                       else
+                               err = -EINVAL;
                        *bad_wr = ib_wr;
                        goto out;
                }
@@ -2235,9 +2239,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
        }
  out:
        spin_unlock_irqrestore(&iwqp->lock, flags);
-       if (ret)
-               return -ENOSYS;
-       return 0;
+       return err;
 }
 
 /**
@@ -2264,7 +2266,7 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
 
        spin_lock_irqsave(&iwcq->lock, flags);
        while (cqe_count < num_entries) {
-               ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info, true);
+               ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info);
                if (ret == I40IW_ERR_QUEUE_EMPTY) {
                        break;
                } else if (ret == I40IW_ERR_QUEUE_DESTROYED) {
@@ -2437,6 +2439,15 @@ static const char * const i40iw_hw_stat_names[] = {
                "iwRdmaInv"
 };
 
+static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str,
+                                size_t str_len)
+{
+       u32 firmware_version = I40IW_FW_VERSION;
+
+       snprintf(str, str_len, "%u.%u", firmware_version,
+                      (firmware_version & 0x000000ff));
+}
+
 /**
  * i40iw_alloc_hw_stats - Allocate a hw stats structure
  * @ibdev: device pointer from stack
@@ -2528,7 +2539,7 @@ static int i40iw_modify_port(struct ib_device *ibdev,
                             int port_modify_mask,
                             struct ib_port_modify *props)
 {
-       return 0;
+       return -ENOSYS;
 }
 
 /**
@@ -2660,6 +2671,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
        memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name,
               sizeof(iwibdev->ibdev.iwcm->ifname));
        iwibdev->ibdev.get_port_immutable   = i40iw_port_immutable;
+       iwibdev->ibdev.get_dev_fw_str       = i40iw_get_dev_fw_str;
        iwibdev->ibdev.poll_cq = i40iw_poll_cq;
        iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq;
        iwibdev->ibdev.post_send = i40iw_post_send;
@@ -2723,7 +2735,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
 
        iwdev->iwibdev = i40iw_init_rdma_device(iwdev);
        if (!iwdev->iwibdev)
-               return -ENOSYS;
+               return -ENOMEM;
        iwibdev = iwdev->iwibdev;
 
        ret = ib_register_device(&iwibdev->ibdev, NULL);
@@ -2748,5 +2760,5 @@ error:
        kfree(iwdev->iwibdev->ibdev.iwcm);
        iwdev->iwibdev->ibdev.iwcm = NULL;
        ib_dealloc_device(&iwdev->iwibdev->ibdev);
-       return -ENOSYS;
+       return ret;
 }
index 9f8b516..d6fc8a6 100644 (file)
@@ -288,7 +288,7 @@ static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
        if (cq->resize_buf)
                return -EBUSY;
 
-       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
        if (!cq->resize_buf)
                return -ENOMEM;
 
@@ -316,7 +316,7 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq
        if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
                return -EFAULT;
 
-       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
        if (!cq->resize_buf)
                return -ENOMEM;
 
index 42a4607..2af44c2 100644 (file)
@@ -2025,16 +2025,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr,
        return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
 }
 
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                          char *buf)
-{
-       struct mlx4_ib_dev *dev =
-               container_of(device, struct mlx4_ib_dev, ib_dev.dev);
-       return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32),
-                      (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
-                      (int) dev->dev->caps.fw_ver & 0xffff);
-}
-
 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
                        char *buf)
 {
@@ -2053,17 +2043,204 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
 }
 
 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
 
 static struct device_attribute *mlx4_class_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type,
        &dev_attr_board_id
 };
 
+struct diag_counter {
+       const char *name;
+       u32 offset;
+};
+
+#define DIAG_COUNTER(_name, _offset)                   \
+       { .name = #_name, .offset = _offset }
+
+static const struct diag_counter diag_basic[] = {
+       DIAG_COUNTER(rq_num_lle, 0x00),
+       DIAG_COUNTER(sq_num_lle, 0x04),
+       DIAG_COUNTER(rq_num_lqpoe, 0x08),
+       DIAG_COUNTER(sq_num_lqpoe, 0x0C),
+       DIAG_COUNTER(rq_num_lpe, 0x18),
+       DIAG_COUNTER(sq_num_lpe, 0x1C),
+       DIAG_COUNTER(rq_num_wrfe, 0x20),
+       DIAG_COUNTER(sq_num_wrfe, 0x24),
+       DIAG_COUNTER(sq_num_mwbe, 0x2C),
+       DIAG_COUNTER(sq_num_bre, 0x34),
+       DIAG_COUNTER(sq_num_rire, 0x44),
+       DIAG_COUNTER(rq_num_rire, 0x48),
+       DIAG_COUNTER(sq_num_rae, 0x4C),
+       DIAG_COUNTER(rq_num_rae, 0x50),
+       DIAG_COUNTER(sq_num_roe, 0x54),
+       DIAG_COUNTER(sq_num_tree, 0x5C),
+       DIAG_COUNTER(sq_num_rree, 0x64),
+       DIAG_COUNTER(rq_num_rnr, 0x68),
+       DIAG_COUNTER(sq_num_rnr, 0x6C),
+       DIAG_COUNTER(rq_num_oos, 0x100),
+       DIAG_COUNTER(sq_num_oos, 0x104),
+};
+
+static const struct diag_counter diag_ext[] = {
+       DIAG_COUNTER(rq_num_dup, 0x130),
+       DIAG_COUNTER(sq_num_to, 0x134),
+};
+
+static const struct diag_counter diag_device_only[] = {
+       DIAG_COUNTER(num_cqovf, 0x1A0),
+       DIAG_COUNTER(rq_num_udsdprd, 0x118),
+};
+
+static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev,
+                                                   u8 port_num)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibdev);
+       struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+
+       if (!diag[!!port_num].name)
+               return NULL;
+
+       return rdma_alloc_hw_stats_struct(diag[!!port_num].name,
+                                         diag[!!port_num].num_counters,
+                                         RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx4_ib_get_hw_stats(struct ib_device *ibdev,
+                               struct rdma_hw_stats *stats,
+                               u8 port, int index)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibdev);
+       struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+       u32 hw_value[ARRAY_SIZE(diag_device_only) +
+               ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {};
+       int ret;
+       int i;
+
+       ret = mlx4_query_diag_counters(dev->dev,
+                                      MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS,
+                                      diag[!!port].offset, hw_value,
+                                      diag[!!port].num_counters, port);
+
+       if (ret)
+               return ret;
+
+       for (i = 0; i < diag[!!port].num_counters; i++)
+               stats->value[i] = hw_value[i];
+
+       return diag[!!port].num_counters;
+}
+
+static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev,
+                                        const char ***name,
+                                        u32 **offset,
+                                        u32 *num,
+                                        bool port)
+{
+       u32 num_counters;
+
+       num_counters = ARRAY_SIZE(diag_basic);
+
+       if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT)
+               num_counters += ARRAY_SIZE(diag_ext);
+
+       if (!port)
+               num_counters += ARRAY_SIZE(diag_device_only);
+
+       *name = kcalloc(num_counters, sizeof(**name), GFP_KERNEL);
+       if (!*name)
+               return -ENOMEM;
+
+       *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL);
+       if (!*offset)
+               goto err_name;
+
+       *num = num_counters;
+
+       return 0;
+
+err_name:
+       kfree(*name);
+       return -ENOMEM;
+}
+
+static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev,
+                                      const char **name,
+                                      u32 *offset,
+                                      bool port)
+{
+       int i;
+       int j;
+
+       for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) {
+               name[i] = diag_basic[i].name;
+               offset[i] = diag_basic[i].offset;
+       }
+
+       if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) {
+               for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) {
+                       name[j] = diag_ext[i].name;
+                       offset[j] = diag_ext[i].offset;
+               }
+       }
+
+       if (!port) {
+               for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) {
+                       name[j] = diag_device_only[i].name;
+                       offset[j] = diag_device_only[i].offset;
+               }
+       }
+}
+
+static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
+{
+       struct mlx4_ib_diag_counters *diag = ibdev->diag_counters;
+       int i;
+       int ret;
+       bool per_port = !!(ibdev->dev->caps.flags2 &
+               MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT);
+
+       for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+               /* i == 1 means we are building port counters */
+               if (i && !per_port)
+                       continue;
+
+               ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].name,
+                                                   &diag[i].offset,
+                                                   &diag[i].num_counters, i);
+               if (ret)
+                       goto err_alloc;
+
+               mlx4_ib_fill_diag_counters(ibdev, diag[i].name,
+                                          diag[i].offset, i);
+       }
+
+       ibdev->ib_dev.get_hw_stats      = mlx4_ib_get_hw_stats;
+       ibdev->ib_dev.alloc_hw_stats    = mlx4_ib_alloc_hw_stats;
+
+       return 0;
+
+err_alloc:
+       if (i) {
+               kfree(diag[i - 1].name);
+               kfree(diag[i - 1].offset);
+       }
+
+       return ret;
+}
+
+static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev)
+{
+       int i;
+
+       for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+               kfree(ibdev->diag_counters[i].offset);
+               kfree(ibdev->diag_counters[i].name);
+       }
+}
+
 #define MLX4_IB_INVALID_MAC    ((u64)-1)
 static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
                               struct net_device *dev,
@@ -2280,6 +2457,17 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+static void get_fw_ver_str(struct ib_device *device, char *str,
+                          size_t str_len)
+{
+       struct mlx4_ib_dev *dev =
+               container_of(device, struct mlx4_ib_dev, ib_dev);
+       snprintf(str, str_len, "%d.%d.%d",
+                (int) (dev->dev->caps.fw_ver >> 32),
+                (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+                (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
 static void *mlx4_ib_add(struct mlx4_dev *dev)
 {
        struct mlx4_ib_dev *ibdev;
@@ -2413,6 +2601,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        ibdev->ib_dev.detach_mcast      = mlx4_ib_mcg_detach;
        ibdev->ib_dev.process_mad       = mlx4_ib_process_mad;
        ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
+       ibdev->ib_dev.get_dev_fw_str    = get_fw_ver_str;
        ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
 
        if (!mlx4_is_slave(ibdev->dev)) {
@@ -2555,9 +2744,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
                atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
 
-       if (ib_register_device(&ibdev->ib_dev, NULL))
+       if (mlx4_ib_alloc_diag_counters(ibdev))
                goto err_steer_free_bitmap;
 
+       if (ib_register_device(&ibdev->ib_dev, NULL))
+               goto err_diag_counters;
+
        if (mlx4_ib_mad_init(ibdev))
                goto err_reg;
 
@@ -2623,6 +2815,9 @@ err_mad:
 err_reg:
        ib_unregister_device(&ibdev->ib_dev);
 
+err_diag_counters:
+       mlx4_ib_diag_cleanup(ibdev);
+
 err_steer_free_bitmap:
        kfree(ibdev->ib_uc_qpns_bitmap);
 
@@ -2726,6 +2921,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
        mlx4_ib_close_sriov(ibdev);
        mlx4_ib_mad_cleanup(ibdev);
        ib_unregister_device(&ibdev->ib_dev);
+       mlx4_ib_diag_cleanup(ibdev);
        if (ibdev->iboe.nb.notifier_call) {
                if (unregister_netdevice_notifier(&ibdev->iboe.nb))
                        pr_warn("failure unregistering notifier\n");
index 29acda2..7c5832e 100644 (file)
@@ -549,6 +549,14 @@ struct mlx4_ib_counters {
        u32                     default_counter;
 };
 
+#define MLX4_DIAG_COUNTERS_TYPES 2
+
+struct mlx4_ib_diag_counters {
+       const char **name;
+       u32 *offset;
+       u32 num_counters;
+};
+
 struct mlx4_ib_dev {
        struct ib_device        ib_dev;
        struct mlx4_dev        *dev;
@@ -585,6 +593,7 @@ struct mlx4_ib_dev {
        /* protect resources needed as part of reset flow */
        spinlock_t              reset_flow_resource_lock;
        struct list_head                qp_list;
+       struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES];
 };
 
 struct ib_event_work {
index 9c0e67b..308a358 100644 (file)
@@ -424,6 +424,83 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
        item->key = be32_to_cpu(cqe->mkey);
 }
 
+static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries,
+                        struct ib_wc *wc, int *npolled)
+{
+       struct mlx5_ib_wq *wq;
+       unsigned int cur;
+       unsigned int idx;
+       int np;
+       int i;
+
+       wq = &qp->sq;
+       cur = wq->head - wq->tail;
+       np = *npolled;
+
+       if (cur == 0)
+               return;
+
+       for (i = 0;  i < cur && np < num_entries; i++) {
+               idx = wq->last_poll & (wq->wqe_cnt - 1);
+               wc->wr_id = wq->wrid[idx];
+               wc->status = IB_WC_WR_FLUSH_ERR;
+               wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+               wq->tail++;
+               np++;
+               wc->qp = &qp->ibqp;
+               wc++;
+               wq->last_poll = wq->w_list[idx].next;
+       }
+       *npolled = np;
+}
+
+static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries,
+                        struct ib_wc *wc, int *npolled)
+{
+       struct mlx5_ib_wq *wq;
+       unsigned int cur;
+       int np;
+       int i;
+
+       wq = &qp->rq;
+       cur = wq->head - wq->tail;
+       np = *npolled;
+
+       if (cur == 0)
+               return;
+
+       for (i = 0;  i < cur && np < num_entries; i++) {
+               wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+               wc->status = IB_WC_WR_FLUSH_ERR;
+               wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+               wq->tail++;
+               np++;
+               wc->qp = &qp->ibqp;
+               wc++;
+       }
+       *npolled = np;
+}
+
+static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries,
+                                struct ib_wc *wc, int *npolled)
+{
+       struct mlx5_ib_qp *qp;
+
+       *npolled = 0;
+       /* Find uncompleted WQEs of this CQ and return flush completions */
+       list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
+               sw_send_comp(qp, num_entries, wc + *npolled, npolled);
+               if (*npolled >= num_entries)
+                       return;
+       }
+
+       list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
+               sw_recv_comp(qp, num_entries, wc + *npolled, npolled);
+               if (*npolled >= num_entries)
+                       return;
+       }
+}
+
 static int mlx5_poll_one(struct mlx5_ib_cq *cq,
                         struct mlx5_ib_qp **cur_qp,
                         struct ib_wc *wc)
@@ -594,12 +671,18 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 {
        struct mlx5_ib_cq *cq = to_mcq(ibcq);
        struct mlx5_ib_qp *cur_qp = NULL;
+       struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+       struct mlx5_core_dev *mdev = dev->mdev;
        unsigned long flags;
        int soft_polled = 0;
        int npolled;
        int err = 0;
 
        spin_lock_irqsave(&cq->lock, flags);
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+               goto out;
+       }
 
        if (unlikely(!list_empty(&cq->wc_list)))
                soft_polled = poll_soft_wc(cq, num_entries, wc);
@@ -612,7 +695,7 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 
        if (npolled)
                mlx5_cq_set_ci(&cq->mcq);
-
+out:
        spin_unlock_irqrestore(&cq->lock, flags);
 
        if (err == 0 || err == -EAGAIN)
@@ -843,6 +926,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
        cq->resize_buf = NULL;
        cq->resize_umem = NULL;
        cq->create_flags = attr->flags;
+       INIT_LIST_HEAD(&cq->list_send_qp);
+       INIT_LIST_HEAD(&cq->list_recv_qp);
 
        if (context) {
                err = create_cq_user(dev, udata, context, cq, entries,
index 53e03c8..79e6309 100644 (file)
@@ -69,15 +69,6 @@ static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev)
        return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn);
 }
 
-static u32 next_outstanding(struct mlx5_ib_gsi_qp *gsi, u32 index)
-{
-       return ++index % gsi->cap.max_send_wr;
-}
-
-#define for_each_outstanding_wr(gsi, index) \
-       for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; \
-            index = next_outstanding(gsi, index))
-
 /* Call with gsi->lock locked */
 static void generate_completions(struct mlx5_ib_gsi_qp *gsi)
 {
@@ -85,8 +76,9 @@ static void generate_completions(struct mlx5_ib_gsi_qp *gsi)
        struct mlx5_ib_gsi_wr *wr;
        u32 index;
 
-       for_each_outstanding_wr(gsi, index) {
-               wr = &gsi->outstanding_wrs[index];
+       for (index = gsi->outstanding_ci; index != gsi->outstanding_pi;
+            index++) {
+               wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr];
 
                if (!wr->completed)
                        break;
@@ -430,8 +422,9 @@ static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi,
                return -ENOMEM;
        }
 
-       gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi];
-       gsi->outstanding_pi = next_outstanding(gsi, gsi->outstanding_pi);
+       gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi %
+                                      gsi->cap.max_send_wr];
+       gsi->outstanding_pi++;
 
        if (!wc) {
                memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc));
index dad63f0..a84bb76 100644 (file)
 #include <asm/pat.h>
 #endif
 #include <linux/sched.h>
+#include <linux/delay.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
 #include <linux/mlx5/port.h>
 #include <linux/mlx5/vport.h>
+#include <linux/list.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
 #include <linux/in.h>
@@ -457,8 +459,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        int max_rq_sg;
        int max_sq_sg;
        u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
+       struct mlx5_ib_query_device_resp resp = {};
+       size_t resp_len;
+       u64 max_tso;
 
-       if (uhw->inlen || uhw->outlen)
+       resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
+       if (uhw->outlen && uhw->outlen < resp_len)
+               return -EINVAL;
+       else
+               resp.response_length = resp_len;
+
+       if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
                return -EINVAL;
 
        memset(props, 0, sizeof(*props));
@@ -511,10 +522,21 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        if (MLX5_CAP_GEN(mdev, block_lb_mc))
                props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 
-       if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
-           (MLX5_CAP_ETH(dev->mdev, csum_cap)))
+       if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
+               if (MLX5_CAP_ETH(mdev, csum_cap))
                        props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
 
+               if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
+                       max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
+                       if (max_tso) {
+                               resp.tso_caps.max_tso = 1 << max_tso;
+                               resp.tso_caps.supported_qpts |=
+                                       1 << IB_QPT_RAW_PACKET;
+                               resp.response_length += sizeof(resp.tso_caps);
+                       }
+               }
+       }
+
        if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
                props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
                props->device_cap_flags |= IB_DEVICE_UD_TSO;
@@ -576,6 +598,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        if (!mlx5_core_is_pf(mdev))
                props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
 
+       if (uhw->outlen) {
+               err = ib_copy_to_udata(uhw, &resp, resp.response_length);
+
+               if (err)
+                       return err;
+       }
+
        return 0;
 }
 
@@ -983,6 +1012,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                        goto out_uars;
        }
 
+       INIT_LIST_HEAD(&context->vma_private_list);
        INIT_LIST_HEAD(&context->db_page_list);
        mutex_init(&context->db_page_mutex);
 
@@ -992,6 +1022,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
        if (field_avail(typeof(resp), cqe_version, udata->outlen))
                resp.response_length += sizeof(resp.cqe_version);
 
+       if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
+               resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE;
+               resp.response_length += sizeof(resp.cmds_supp_uhw);
+       }
+
        /*
         * We don't want to expose information from the PCI bar that is located
         * after 4096 bytes, so if the arch only supports larger pages, let's
@@ -1006,8 +1041,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                        offsetof(struct mlx5_init_seg, internal_timer_h) %
                        PAGE_SIZE;
                resp.response_length += sizeof(resp.hca_core_clock_offset) +
-                                       sizeof(resp.reserved2) +
-                                       sizeof(resp.reserved3);
+                                       sizeof(resp.reserved2);
        }
 
        err = ib_copy_to_udata(udata, &resp, resp.response_length);
@@ -1086,6 +1120,125 @@ static int get_index(unsigned long offset)
        return get_arg(offset);
 }
 
+static void  mlx5_ib_vma_open(struct vm_area_struct *area)
+{
+       /* vma_open is called when a new VMA is created on top of our VMA.  This
+        * is done through either mremap flow or split_vma (usually due to
+        * mlock, madvise, munmap, etc.) We do not support a clone of the VMA,
+        * as this VMA is strongly hardware related.  Therefore we set the
+        * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
+        * calling us again and trying to do incorrect actions.  We assume that
+        * the original VMA size is exactly a single page, and therefore no
+        * "splitting" operation will happen to it.
+        */
+       area->vm_ops = NULL;
+}
+
+static void  mlx5_ib_vma_close(struct vm_area_struct *area)
+{
+       struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
+
+       /* It's guaranteed that all VMAs opened on a FD are closed before the
+        * file itself is closed, therefore no sync is needed with the regular
+        * closing flow (e.g. mlx5 ib_dealloc_ucontext).
+        * However, a sync is needed with the VMA access done as part of
+        * mlx5_ib_disassociate_ucontext.
+        * The close operation is usually called under mm->mmap_sem, except
+        * when the process is exiting.
+        * The exiting case is handled explicitly as part of
+        * mlx5_ib_disassociate_ucontext.
+        */
+       mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
+
+       /* Set the vma pointer in the mlx5_ib driver's private data to NULL,
+        * to protect against a race condition in
+        * mlx5_ib_disassociate_ucontext().
+        */
+       mlx5_ib_vma_priv_data->vma = NULL;
+       list_del(&mlx5_ib_vma_priv_data->list);
+       kfree(mlx5_ib_vma_priv_data);
+}
+
+static const struct vm_operations_struct mlx5_ib_vm_ops = {
+       .open = mlx5_ib_vma_open,
+       .close = mlx5_ib_vma_close
+};
+
+static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
+                               struct mlx5_ib_ucontext *ctx)
+{
+       struct mlx5_ib_vma_private_data *vma_prv;
+       struct list_head *vma_head = &ctx->vma_private_list;
+
+       vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
+       if (!vma_prv)
+               return -ENOMEM;
+
+       vma_prv->vma = vma;
+       vma->vm_private_data = vma_prv;
+       vma->vm_ops =  &mlx5_ib_vm_ops;
+
+       list_add(&vma_prv->list, vma_head);
+
+       return 0;
+}
+
+static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+       int ret;
+       struct vm_area_struct *vma;
+       struct mlx5_ib_vma_private_data *vma_private, *n;
+       struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+       struct task_struct *owning_process  = NULL;
+       struct mm_struct   *owning_mm       = NULL;
+
+       owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+       if (!owning_process)
+               return;
+
+       owning_mm = get_task_mm(owning_process);
+       if (!owning_mm) {
+               pr_info("no mm, disassociate ucontext is pending task termination\n");
+               while (1) {
+                       put_task_struct(owning_process);
+                       usleep_range(1000, 2000);
+                       owning_process = get_pid_task(ibcontext->tgid,
+                                                     PIDTYPE_PID);
+                       if (!owning_process ||
+                           owning_process->state == TASK_DEAD) {
+                               pr_info("disassociate ucontext done, task was terminated\n");
+                               /* If the task was dead, its task struct
+                                * still needs to be released.
+                                */
+                               if (owning_process)
+                                       put_task_struct(owning_process);
+                               return;
+                       }
+               }
+       }
+
+       /* Protect against a race with the vma being closed as part of
+        * mlx5_ib_vma_close.
+        */
+       down_read(&owning_mm->mmap_sem);
+       list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
+                                list) {
+               vma = vma_private->vma;
+               ret = zap_vma_ptes(vma, vma->vm_start,
+                                  PAGE_SIZE);
+               WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__);
+               /* The context is going to be destroyed, so the ops
+                * must not be accessed any more.
+                */
+               vma->vm_ops = NULL;
+               list_del(&vma_private->list);
+               kfree(vma_private);
+       }
+       up_read(&owning_mm->mmap_sem);
+       mmput(owning_mm);
+       put_task_struct(owning_process);
+}
+
 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
 {
        switch (cmd) {
@@ -1101,8 +1254,10 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
 }
 
 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
-                   struct vm_area_struct *vma, struct mlx5_uuar_info *uuari)
+                   struct vm_area_struct *vma,
+                   struct mlx5_ib_ucontext *context)
 {
+       struct mlx5_uuar_info *uuari = &context->uuari;
        int err;
        unsigned long idx;
        phys_addr_t pfn, pa;
@@ -1152,14 +1307,13 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
        mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
                    vma->vm_start, &pa);
 
-       return 0;
+       return mlx5_ib_set_vma_data(vma, context);
 }
 
 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
 {
        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
-       struct mlx5_uuar_info *uuari = &context->uuari;
        unsigned long command;
        phys_addr_t pfn;
 
@@ -1168,7 +1322,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
        case MLX5_IB_MMAP_WC_PAGE:
        case MLX5_IB_MMAP_NC_PAGE:
        case MLX5_IB_MMAP_REGULAR_PAGE:
-               return uar_mmap(dev, command, vma, uuari);
+               return uar_mmap(dev, command, vma, context);
 
        case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
                return -ENOSYS;
@@ -1331,6 +1485,32 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
                       &ib_spec->ipv4.val.dst_ip,
                       sizeof(ib_spec->ipv4.val.dst_ip));
                break;
+       case IB_FLOW_SPEC_IPV6:
+               if (ib_spec->size != sizeof(ib_spec->ipv6))
+                       return -EINVAL;
+
+               MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
+                        ethertype, 0xffff);
+               MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
+                        ethertype, ETH_P_IPV6);
+
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+                                   src_ipv4_src_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.mask.src_ip,
+                      sizeof(ib_spec->ipv6.mask.src_ip));
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
+                                   src_ipv4_src_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.val.src_ip,
+                      sizeof(ib_spec->ipv6.val.src_ip));
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+                                   dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.mask.dst_ip,
+                      sizeof(ib_spec->ipv6.mask.dst_ip));
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
+                                   dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.val.dst_ip,
+                      sizeof(ib_spec->ipv6.val.dst_ip));
+               break;
        case IB_FLOW_SPEC_TCP:
                if (ib_spec->size != sizeof(ib_spec->tcp_udp))
                        return -EINVAL;
@@ -1801,15 +1981,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr,
        return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
 }
 
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                          char *buf)
-{
-       struct mlx5_ib_dev *dev =
-               container_of(device, struct mlx5_ib_dev, ib_dev.dev);
-       return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev),
-                      fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
-}
-
 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
                        char *buf)
 {
@@ -1828,7 +1999,6 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
 }
 
 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
 static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
@@ -1836,7 +2006,6 @@ static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
 
 static struct device_attribute *mlx5_class_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type,
        &dev_attr_board_id,
        &dev_attr_fw_pages,
@@ -1854,6 +2023,65 @@ static void pkey_change_handler(struct work_struct *work)
        mutex_unlock(&ports->devr->mutex);
 }
 
+/*
+ * React to a device internal error (called for MLX5_DEV_EVENT_SYS_ERROR).
+ *
+ * Walks every QP on ibdev->qp_list under reset_flow_resource_lock.  For each
+ * QP whose send queue has head != tail, and for each QP without an SRQ whose
+ * receive queue has head != tail, the matching CQ is collected on a local
+ * list, provided the CQ has both a core completion callback (mcq.comp) and a
+ * consumer comp_handler.  A CQ is collected at most once; this is tracked by
+ * mcq.reset_notify_added, which is set here and not cleared in this function.
+ *
+ * Finally the comp callback of every collected CQ is invoked, still under
+ * reset_flow_resource_lock.
+ */
+static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
+{
+       struct mlx5_ib_qp *mqp;
+       struct mlx5_ib_cq *send_mcq, *recv_mcq;
+       struct mlx5_core_cq *mcq;
+       struct list_head cq_armed_list;
+       unsigned long flags_qp;
+       unsigned long flags_cq;
+       unsigned long flags;
+
+       INIT_LIST_HEAD(&cq_armed_list);
+
+       /* Go over the QP list of this ibdev; the lock syncs with QP
+        * create/destroy.
+        */
+       spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+       list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+               /* Send side: lock order is sq.lock, then the send CQ lock. */
+               spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+               if (mqp->sq.tail != mqp->sq.head) {
+                       send_mcq = to_mcq(mqp->ibqp.send_cq);
+                       spin_lock_irqsave(&send_mcq->lock, flags_cq);
+                       if (send_mcq->mcq.comp &&
+                           mqp->ibqp.send_cq->comp_handler) {
+                               /* Collect each CQ only once. */
+                               if (!send_mcq->mcq.reset_notify_added) {
+                                       send_mcq->mcq.reset_notify_added = 1;
+                                       list_add_tail(&send_mcq->mcq.reset_notify,
+                                                     &cq_armed_list);
+                               }
+                       }
+                       spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+               }
+               spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+               /* Receive side: lock order is rq.lock, then the recv CQ lock. */
+               spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+               /* no handling is needed for SRQ */
+               if (!mqp->ibqp.srq) {
+                       if (mqp->rq.tail != mqp->rq.head) {
+                               recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+                               spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+                               if (recv_mcq->mcq.comp &&
+                                   mqp->ibqp.recv_cq->comp_handler) {
+                                       /* Collect each CQ only once. */
+                                       if (!recv_mcq->mcq.reset_notify_added) {
+                                               recv_mcq->mcq.reset_notify_added = 1;
+                                               list_add_tail(&recv_mcq->mcq.reset_notify,
+                                                             &cq_armed_list);
+                                       }
+                               }
+                               spin_unlock_irqrestore(&recv_mcq->lock,
+                                                      flags_cq);
+                       }
+               }
+               spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+       }
+       /* At this point every in-flight post has been serialized against us,
+        * since each queue lock above was taken and released.  Now invoke the
+        * completion callback of every collected CQ.
+        */
+       list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
+               mcq->comp(mcq);
+       }
+       spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+}
+
 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
                          enum mlx5_dev_event event, unsigned long param)
 {
@@ -1866,6 +2094,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
        case MLX5_DEV_EVENT_SYS_ERROR:
                ibdev->ib_active = false;
                ibev.event = IB_EVENT_DEVICE_FATAL;
+               mlx5_ib_handle_internal_error(ibdev);
                break;
 
        case MLX5_DEV_EVENT_PORT_UP:
@@ -2272,6 +2501,15 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+/*
+ * Write the device firmware version into str as "<major>.<minor>.<sub>",
+ * with the sub-minor part zero-padded to four digits.  At most str_len bytes
+ * are written.
+ */
+static void get_dev_fw_str(struct ib_device *ibdev, char *str,
+                          size_t str_len)
+{
+       struct mlx5_ib_dev *mibdev;
+       struct mlx5_core_dev *core;
+
+       mibdev = container_of(ibdev, struct mlx5_ib_dev, ib_dev);
+       core = mibdev->mdev;
+
+       snprintf(str, str_len, "%d.%d.%04d",
+                fw_rev_maj(core), fw_rev_min(core), fw_rev_sub(core));
+}
+
 static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
 {
        int err;
@@ -2298,6 +2536,113 @@ static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
        unregister_netdevice_notifier(&dev->roce.nb);
 }
 
+/* Release the queue counter held by each of the device's ports. */
+static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
+{
+       unsigned int idx = 0;
+
+       while (idx < dev->num_ports) {
+               mlx5_core_dealloc_q_counter(dev->mdev,
+                                           dev->port[idx].q_cnt_id);
+               idx++;
+       }
+}
+
+/*
+ * Allocate one queue counter per port and store its id in dev->port[].
+ *
+ * Returns 0 on success.  On failure the counters already allocated for the
+ * preceding ports are released and the error from
+ * mlx5_core_alloc_q_counter() is returned.
+ */
+static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
+{
+       int idx;
+       int err;
+
+       for (idx = 0; idx < dev->num_ports; idx++) {
+               err = mlx5_core_alloc_q_counter(dev->mdev,
+                                               &dev->port[idx].q_cnt_id);
+               if (!err)
+                       continue;
+
+               mlx5_ib_warn(dev,
+                            "couldn't allocate queue counter for port %d, err %d\n",
+                            idx + 1, err);
+
+               /* Unwind: free what was allocated for the earlier ports. */
+               for (idx--; idx >= 0; idx--)
+                       mlx5_core_dealloc_q_counter(dev->mdev,
+                                                   dev->port[idx].q_cnt_id);
+
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Names of the per-port hardware counters, passed to
+ * rdma_alloc_hw_stats_struct().  Entry i here corresponds to entry i of
+ * stats_offsets[]; the two arrays must be kept the same length and order
+ * (a BUILD_BUG_ON in mlx5_ib_alloc_hw_stats() checks the length).
+ */
+static const char * const names[] = {
+       "rx_write_requests",
+       "rx_read_requests",
+       "rx_atomic_requests",
+       "out_of_buffer",
+       "out_of_sequence",
+       "duplicate_request",
+       "rnr_nak_retry_err",
+       "packet_seq_err",
+       "implied_nak_seq_err",
+       "local_ack_timeout_err",
+};
+
+/*
+ * Byte offset of each counter inside the query_q_counter_out layout, in the
+ * same order as names[].  mlx5_ib_get_hw_stats() reads a big-endian 32-bit
+ * value at each of these offsets.
+ */
+static const size_t stats_offsets[] = {
+       MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests),
+       MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests),
+       MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests),
+       MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer),
+       MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence),
+       MLX5_BYTE_OFF(query_q_counter_out, duplicate_request),
+       MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err),
+       MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err),
+       MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err),
+       MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err),
+};
+
+/*
+ * Allocate the hw stats structure describing the counters in names[].
+ * Only per-port statistics are provided: port_num 0 yields NULL.
+ */
+static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
+                                                   u8 port_num)
+{
+       /* names[] and stats_offsets[] are indexed in lockstep. */
+       BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets));
+
+       if (port_num != 0)
+               return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names),
+                                                 RDMA_HW_STATS_DEFAULT_LIFESPAN);
+
+       /* We support only per port stats */
+       return NULL;
+}
+
+/*
+ * Refresh all per-port hardware counters in @stats.
+ *
+ * Queries the queue counter that belongs to @port (1-based) and copies each
+ * big-endian 32-bit field listed in stats_offsets[] into stats->value[].
+ * @index is not used; every counter is updated on each call.
+ *
+ * Returns the number of counters updated (ARRAY_SIZE(names)) on success,
+ * -ENOSYS when @port is 0 or @stats is NULL, -ENOMEM when the output buffer
+ * cannot be allocated, or the error from mlx5_core_query_q_counter().  On
+ * any error stats->value[] is left untouched.
+ */
+static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
+                               struct rdma_hw_stats *stats,
+                               u8 port, int index)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
+       void *out;
+       __be32 val;
+       int ret;
+       int i;
+
+       if (!port || !stats)
+               return -ENOSYS;
+
+       out = mlx5_vzalloc(outlen);
+       if (!out)
+               return -ENOMEM;
+
+       ret = mlx5_core_query_q_counter(dev->mdev,
+                                       dev->port[port - 1].q_cnt_id, 0,
+                                       out, outlen);
+       if (ret)
+               goto free;
+
+       for (i = 0; i < ARRAY_SIZE(names); i++) {
+               val = *(__be32 *)(out + stats_offsets[i]);
+               stats->value[i] = (u64)be32_to_cpu(val);
+       }
+
+       /* Success: report how many counters were written. */
+       ret = ARRAY_SIZE(names);
+free:
+       kvfree(out);
+       /* A failed query must surface as an error, not as a counter count. */
+       return ret;
+}
+
 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 {
        struct mlx5_ib_dev *dev;
@@ -2320,10 +2665,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 
        dev->mdev = mdev;
 
+       dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
+                           GFP_KERNEL);
+       if (!dev->port)
+               goto err_dealloc;
+
        rwlock_init(&dev->roce.netdev_lock);
        err = get_port_caps(dev);
        if (err)
-               goto err_dealloc;
+               goto err_free_port;
 
        if (mlx5_use_mad_ifc(dev))
                get_ext_port_caps(dev);
@@ -2418,6 +2768,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        dev->ib_dev.map_mr_sg           = mlx5_ib_map_mr_sg;
        dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
        dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
+       dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
        if (mlx5_core_is_pf(mdev)) {
                dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
                dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
@@ -2425,6 +2776,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                dev->ib_dev.set_vf_guid         = mlx5_ib_set_vf_guid;
        }
 
+       dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
+
        mlx5_ib_internal_fill_odp_caps(dev);
 
        if (MLX5_CAP_GEN(mdev, imaicl)) {
@@ -2435,6 +2788,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                        (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
        }
 
+       if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
+           MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
+               dev->ib_dev.get_hw_stats        = mlx5_ib_get_hw_stats;
+               dev->ib_dev.alloc_hw_stats      = mlx5_ib_alloc_hw_stats;
+       }
+
        if (MLX5_CAP_GEN(mdev, xrc)) {
                dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
                dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -2447,9 +2806,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
            IB_LINK_LAYER_ETHERNET) {
                dev->ib_dev.create_flow = mlx5_ib_create_flow;
                dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
+               dev->ib_dev.create_wq    = mlx5_ib_create_wq;
+               dev->ib_dev.modify_wq    = mlx5_ib_modify_wq;
+               dev->ib_dev.destroy_wq   = mlx5_ib_destroy_wq;
+               dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
+               dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
                dev->ib_dev.uverbs_ex_cmd_mask |=
                        (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
-                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
+                       (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
+                       (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
+                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
+                       (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
+                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
        }
        err = init_node_data(dev);
        if (err)
@@ -2457,6 +2826,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 
        mutex_init(&dev->flow_db.lock);
        mutex_init(&dev->cap_mask_mutex);
+       INIT_LIST_HEAD(&dev->qp_list);
+       spin_lock_init(&dev->reset_flow_resource_lock);
 
        if (ll == IB_LINK_LAYER_ETHERNET) {
                err = mlx5_enable_roce(dev);
@@ -2472,10 +2843,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        if (err)
                goto err_rsrc;
 
-       err = ib_register_device(&dev->ib_dev, NULL);
+       err = mlx5_ib_alloc_q_counters(dev);
        if (err)
                goto err_odp;
 
+       err = ib_register_device(&dev->ib_dev, NULL);
+       if (err)
+               goto err_q_cnt;
+
        err = create_umr_res(dev);
        if (err)
                goto err_dev;
@@ -2497,6 +2872,9 @@ err_umrc:
 err_dev:
        ib_unregister_device(&dev->ib_dev);
 
+err_q_cnt:
+       mlx5_ib_dealloc_q_counters(dev);
+
 err_odp:
        mlx5_ib_odp_remove_one(dev);
 
@@ -2507,6 +2885,9 @@ err_disable_roce:
        if (ll == IB_LINK_LAYER_ETHERNET)
                mlx5_disable_roce(dev);
 
+err_free_port:
+       kfree(dev->port);
+
 err_dealloc:
        ib_dealloc_device((struct ib_device *)dev);
 
@@ -2519,11 +2900,13 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
 
        ib_unregister_device(&dev->ib_dev);
+       mlx5_ib_dealloc_q_counters(dev);
        destroy_umrc_res(dev);
        mlx5_ib_odp_remove_one(dev);
        destroy_dev_resources(&dev->devr);
        if (ll == IB_LINK_LAYER_ETHERNET)
                mlx5_disable_roce(dev);
+       kfree(dev->port);
        ib_dealloc_device(&dev->ib_dev);
 }
 
index c4a9825..372385d 100644 (file)
@@ -105,6 +105,11 @@ enum {
        MLX5_CQE_VERSION_V1,
 };
 
+/*
+ * Tracks one VMA that belongs to a user context.
+ * @list: entry in mlx5_ib_ucontext.vma_private_list.
+ * @vma:  the tracked VMA; the disassociate path zaps its PTEs and clears its
+ *        vm_ops before freeing this structure.
+ */
+struct mlx5_ib_vma_private_data {
+       struct list_head list;
+       struct vm_area_struct *vma;
+};
+
 struct mlx5_ib_ucontext {
        struct ib_ucontext      ibucontext;
        struct list_head        db_page_list;
@@ -116,6 +121,7 @@ struct mlx5_ib_ucontext {
        u8                      cqe_version;
        /* Transport Domain number */
        u32                     tdn;
+       struct list_head        vma_private_list;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -217,12 +223,41 @@ struct mlx5_ib_wq {
        void                   *qend;
 };
 
+/*
+ * mlx5 receive work queue; wraps the core struct ib_wq (see to_mrwq()).
+ *
+ * Fields filled in by create_user_rq():
+ *   umem            - umem returned by ib_umem_get() for the user buffer
+ *   rq_num_pas      - 'ncont' reported by mlx5_ib_cont_pages()
+ *   page_shift      - page shift reported by mlx5_ib_cont_pages()
+ *   log_page_size   - page_shift - MLX5_ADAPTER_PAGE_SHIFT
+ *   rq_page_offset  - offset computed by mlx5_ib_get_buf_offset()
+ *   wq_sig          - set when the user passed MLX5_WQ_FLAG_SIGNATURE
+ *   db              - doorbell mapped with mlx5_ib_db_map_user()
+ *   create_type     - MLX5_WQ_USER / MLX5_WQ_KERNEL
+ *
+ * buf_size is read by create_user_rq() as the length of the user buffer.
+ * NOTE(review): the remaining fields (rqn, log_rq_stride, log_rq_size,
+ * user_index, wqe_count, wqe_shift) are set outside the code visible here.
+ */
+struct mlx5_ib_rwq {
+       struct ib_wq            ibwq;
+       u32                     rqn;
+       u32                     rq_num_pas;
+       u32                     log_rq_stride;
+       u32                     log_rq_size;
+       u32                     rq_page_offset;
+       u32                     log_page_size;
+       struct ib_umem          *umem;
+       size_t                  buf_size;
+       unsigned int            page_shift;
+       int                     create_type;
+       struct mlx5_db          db;
+       u32                     user_index;
+       u32                     wqe_count;
+       u32                     wqe_shift;
+       int                     wq_sig;
+};
+
 enum {
        MLX5_QP_USER,
        MLX5_QP_KERNEL,
        MLX5_QP_EMPTY
 };
 
+/* Values for mlx5_ib_rwq.create_type; create_user_rq() sets MLX5_WQ_USER. */
+enum {
+       MLX5_WQ_USER,
+       MLX5_WQ_KERNEL
+};
+
+/*
+ * mlx5 wrapper around the core struct ib_rwq_ind_table (see
+ * to_mrwq_ind_table()).
+ * NOTE(review): rqtn is presumably the device RQ table number - confirm
+ * against the code that creates the table.
+ */
+struct mlx5_ib_rwq_ind_table {
+       struct ib_rwq_ind_table ib_rwq_ind_tbl;
+       u32                     rqtn;
+};
+
 /*
  * Connect-IB can trigger up to four concurrent pagefaults
  * per-QP.
@@ -266,6 +301,10 @@ struct mlx5_ib_qp_trans {
        u8                      resp_depth;
 };
 
+/*
+ * State of an RSS raw packet QP.
+ * @tirn: TIR number filled in by mlx5_core_create_tir() and later passed to
+ *        mlx5_core_destroy_tir().
+ */
+struct mlx5_ib_rss_qp {
+       u32     tirn;
+};
+
 struct mlx5_ib_rq {
        struct mlx5_ib_qp_base base;
        struct mlx5_ib_wq       *rq;
@@ -294,6 +333,7 @@ struct mlx5_ib_qp {
        union {
                struct mlx5_ib_qp_trans trans_qp;
                struct mlx5_ib_raw_packet_qp raw_packet_qp;
+               struct mlx5_ib_rss_qp rss_qp;
        };
        struct mlx5_buf         buf;
 
@@ -340,6 +380,9 @@ struct mlx5_ib_qp {
        spinlock_t              disable_page_faults_lock;
        struct mlx5_ib_pfault   pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
 #endif
+       struct list_head        qps_list;
+       struct list_head        cq_recv_list;
+       struct list_head        cq_send_list;
 };
 
 struct mlx5_ib_cq_buf {
@@ -401,6 +444,8 @@ struct mlx5_ib_cq {
        struct mlx5_ib_cq_buf  *resize_buf;
        struct ib_umem         *resize_umem;
        int                     cqe_size;
+       struct list_head        list_send_qp;
+       struct list_head        list_recv_qp;
        u32                     create_flags;
        struct list_head        wc_list;
        enum ib_cq_notify_flags notify_flags;
@@ -546,6 +591,10 @@ struct mlx5_ib_resources {
        struct mutex    mutex;
 };
 
+/*
+ * Per-port driver state.
+ * @q_cnt_id: id of the queue counter allocated for this port by
+ *            mlx5_core_alloc_q_counter().
+ */
+struct mlx5_ib_port {
+       u16 q_cnt_id;
+};
+
 struct mlx5_roce {
        /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
         * netdev pointer
@@ -581,6 +630,11 @@ struct mlx5_ib_dev {
        struct srcu_struct      mr_srcu;
 #endif
        struct mlx5_ib_flow_db  flow_db;
+       /* protect resources needed as part of reset flow */
+       spinlock_t              reset_flow_resource_lock;
+       struct list_head        qp_list;
+       /* Array with num_ports elements */
+       struct mlx5_ib_port     *port;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -628,6 +682,16 @@ static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp)
        return container_of(ibqp, struct mlx5_ib_qp, ibqp);
 }
 
+/* Return the mlx5_ib_rwq that embeds the given core ib_wq. */
+static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq)
+{
+       return container_of(ibwq, struct mlx5_ib_rwq, ibwq);
+}
+
+/* Return the mlx5_ib_rwq_ind_table that embeds the given core ind table. */
+static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+       return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl);
+}
+
 static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq)
 {
        return container_of(msrq, struct mlx5_ib_srq, msrq);
@@ -762,6 +826,16 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
                            struct ib_mr_status *mr_status);
+struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+                               struct ib_wq_init_attr *init_attr,
+                               struct ib_udata *udata);
+int mlx5_ib_destroy_wq(struct ib_wq *wq);
+int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+                     u32 wq_attr_mask, struct ib_udata *udata);
+struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
+                                                     struct ib_rwq_ind_table_init_attr *init_attr,
+                                                     struct ib_udata *udata);
+int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 extern struct workqueue_struct *mlx5_ib_page_fault_wq;
index 8cf2ce5..4b02130 100644 (file)
@@ -1193,12 +1193,16 @@ error:
 
 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
+       struct mlx5_core_dev *mdev = dev->mdev;
        struct umr_common *umrc = &dev->umrc;
        struct mlx5_ib_umr_context umr_context;
        struct mlx5_umr_wr umrwr = {};
        struct ib_send_wr *bad;
        int err;
 
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+               return 0;
+
        mlx5_ib_init_umr_context(&umr_context);
 
        umrwr.wr.wr_cqe = &umr_context.cqe;
index ce0a7ab..0dd7d93 100644 (file)
@@ -77,6 +77,10 @@ struct mlx5_wqe_eth_pad {
        u8 rsvd0[16];
 };
 
+static void get_cqs(enum ib_qp_type qp_type,
+                   struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
+                   struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq);
+
 static int is_qp0(enum ib_qp_type qp_type)
 {
        return qp_type == IB_QPT_SMI;
@@ -609,6 +613,11 @@ static int to_mlx5_st(enum ib_qp_type type)
        }
 }
 
+static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq,
+                            struct mlx5_ib_cq *recv_cq);
+static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
+                              struct mlx5_ib_cq *recv_cq);
+
 static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
 {
        return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
@@ -649,6 +658,71 @@ err_umem:
        return err;
 }
 
+/*
+ * Undo create_user_rq(): unmap the user doorbell and release the umem, if
+ * one is attached.
+ */
+static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq)
+{
+       struct mlx5_ib_ucontext *uctx = to_mucontext(pd->uobject->context);
+
+       mlx5_ib_db_unmap_user(uctx, &rwq->db);
+
+       if (!rwq->umem)
+               return;
+
+       ib_umem_release(rwq->umem);
+}
+
+/*
+ * Set up the user-space resources of a receive work queue.
+ *
+ * Obtains a umem for the user buffer at ucmd->buf_addr (rwq->buf_size
+ * bytes), derives the page layout fields of @rwq from it, and maps the user
+ * doorbell at ucmd->db_addr.
+ *
+ * Returns 0 on success.  Returns -EINVAL when ucmd->buf_addr is 0, or the
+ * error reported by ib_umem_get(), mlx5_ib_get_buf_offset() or
+ * mlx5_ib_db_map_user(); the umem is released on the latter two failures.
+ */
+static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+                         struct mlx5_ib_rwq *rwq,
+                         struct mlx5_ib_create_wq *ucmd)
+{
+       struct mlx5_ib_ucontext *context;
+       int page_shift = 0;
+       int npages;
+       int ncont = 0;
+       int err;
+
+       if (!ucmd->buf_addr)
+               return -EINVAL;
+
+       context = to_mucontext(pd->uobject->context);
+       rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
+                              rwq->buf_size, 0, 0);
+       if (IS_ERR(rwq->umem)) {
+               mlx5_ib_dbg(dev, "umem_get failed\n");
+               err = PTR_ERR(rwq->umem);
+               return err;
+       }
+
+       mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, &npages, &page_shift,
+                          &ncont, NULL);
+       err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift,
+                                    &rwq->rq_page_offset);
+       if (err) {
+               mlx5_ib_warn(dev, "bad offset\n");
+               goto err_umem;
+       }
+
+       rwq->rq_num_pas = ncont;
+       rwq->page_shift = page_shift;
+       rwq->log_page_size =  page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+       rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE);
+
+       /* Log the offset that was actually computed, not a constant 0. */
+       mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %u\n",
+                   (unsigned long long)ucmd->buf_addr, rwq->buf_size,
+                   npages, page_shift, ncont, rwq->rq_page_offset);
+
+       err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
+       if (err) {
+               mlx5_ib_dbg(dev, "map failed\n");
+               goto err_umem;
+       }
+
+       rwq->create_type = MLX5_WQ_USER;
+       return 0;
+
+err_umem:
+       ib_umem_release(rwq->umem);
+       return err;
+}
+
 static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                          struct mlx5_ib_qp *qp, struct ib_udata *udata,
                          struct ib_qp_init_attr *attr,
@@ -1201,6 +1275,187 @@ static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp,
        rq->doorbell = &qp->db;
 }
 
+/* Destroy the TIR whose number is stored in qp->rss_qp.tirn. */
+static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+       mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn);
+}
+
+/*
+ * Create the TIR that backs an RSS raw packet QP.
+ *
+ * Validates the request (QP type, create flags, no send CQ, udata in/out
+ * lengths, comp_mask and reserved fields), copies a zeroed response header
+ * to user space, and then builds an indirect TIR on
+ * init_attr->rwq_ind_tbl using the hash function, key and field selection
+ * from the user command.  The resulting TIR number is stored in
+ * qp->rss_qp.tirn.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EOPNOTSUPP  QP type is not IB_QPT_RAW_PACKET, trailing input bytes are
+ *                not zero, comp_mask/reserved fields are set, or the hash
+ *                function is not Toeplitz
+ *   -EINVAL      bad flags/send_cq, bad udata lengths, key length mismatch,
+ *                or an inconsistent rx_hash_fields_mask
+ *   -EFAULT      the user command could not be copied in
+ *   -ENOMEM      the command buffer could not be allocated
+ * plus whatever ib_copy_to_udata() or mlx5_core_create_tir() report.
+ */
+static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                                struct ib_pd *pd,
+                                struct ib_qp_init_attr *init_attr,
+                                struct ib_udata *udata)
+{
+       struct ib_uobject *uobj = pd->uobject;
+       struct ib_ucontext *ucontext = uobj->context;
+       struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+       struct mlx5_ib_create_qp_resp resp = {};
+       int inlen;
+       int err;
+       u32 *in;
+       void *tirc;
+       void *hfso;
+       u32 selected_fields = 0;
+       size_t min_resp_len;
+       u32 tdn = mucontext->tdn;
+       struct mlx5_ib_create_qp_rss ucmd = {};
+       size_t required_cmd_sz;
+
+       if (init_attr->qp_type != IB_QPT_RAW_PACKET)
+               return -EOPNOTSUPP;
+
+       if (init_attr->create_flags || init_attr->send_cq)
+               return -EINVAL;
+
+       min_resp_len = offsetof(typeof(resp), uuar_index) + sizeof(resp.uuar_index);
+       if (udata->outlen < min_resp_len)
+               return -EINVAL;
+
+       required_cmd_sz = offsetof(typeof(ucmd), reserved1) + sizeof(ucmd.reserved1);
+       if (udata->inlen < required_cmd_sz) {
+               mlx5_ib_dbg(dev, "invalid inlen\n");
+               return -EINVAL;
+       }
+
+       /* Input longer than we know is accepted only if the tail is zero. */
+       if (udata->inlen > sizeof(ucmd) &&
+           !ib_is_udata_cleared(udata, sizeof(ucmd),
+                                udata->inlen - sizeof(ucmd))) {
+               mlx5_ib_dbg(dev, "inlen is not supported\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+               mlx5_ib_dbg(dev, "copy failed\n");
+               return -EFAULT;
+       }
+
+       if (ucmd.comp_mask) {
+               mlx5_ib_dbg(dev, "invalid comp mask\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)) || ucmd.reserved1) {
+               mlx5_ib_dbg(dev, "invalid reserved\n");
+               return -EOPNOTSUPP;
+       }
+
+       err = ib_copy_to_udata(udata, &resp, min_resp_len);
+       if (err) {
+               mlx5_ib_dbg(dev, "copy failed\n");
+               /* Propagate the helper's error instead of masking it. */
+               return err;
+       }
+
+       inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+       MLX5_SET(tirc, tirc, disp_type,
+                MLX5_TIRC_DISP_TYPE_INDIRECT);
+       MLX5_SET(tirc, tirc, indirect_table,
+                init_attr->rwq_ind_tbl->ind_tbl_num);
+       MLX5_SET(tirc, tirc, transport_domain, tdn);
+
+       hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+       switch (ucmd.rx_hash_function) {
+       case MLX5_RX_HASH_FUNC_TOEPLITZ:
+       {
+               void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
+               size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key);
+
+               /* The user key must exactly fill the device key field. */
+               if (len != ucmd.rx_key_len) {
+                       err = -EINVAL;
+                       goto err;
+               }
+
+               MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
+               MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+               memcpy(rss_key, ucmd.rx_hash_key, len);
+               break;
+       }
+       default:
+               err = -EOPNOTSUPP;
+               goto err;
+       }
+
+       if (!ucmd.rx_hash_fields_mask) {
+               /* special case when this TIR serves as steering entry without hashing */
+               if (!init_attr->rwq_ind_tbl->log_ind_tbl_size)
+                       goto create_tir;
+               err = -EINVAL;
+               goto err;
+       }
+
+       /* IPv4 and IPv6 fields are mutually exclusive. */
+       if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) &&
+            ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
+            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) {
+               err = -EINVAL;
+               goto err;
+       }
+
+       /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */
+       if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+           (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4))
+               MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+                        MLX5_L3_PROT_TYPE_IPV4);
+       else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
+                (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
+               MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+                        MLX5_L3_PROT_TYPE_IPV6);
+
+       /* TCP and UDP port fields are mutually exclusive. */
+       if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) &&
+            ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
+            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))) {
+               err = -EINVAL;
+               goto err;
+       }
+
+       /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */
+       if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+           (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP))
+               MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+                        MLX5_L4_PROT_TYPE_TCP);
+       else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
+                (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
+               MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+                        MLX5_L4_PROT_TYPE_UDP);
+
+       /* Translate the user mask into the device's selected_fields bits. */
+       if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+           (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6))
+               selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP;
+
+       if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) ||
+           (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
+               selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP;
+
+       if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+           (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP))
+               selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT;
+
+       if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) ||
+           (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
+               selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT;
+
+       MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);
+
+create_tir:
+       err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
+
+       if (err)
+               goto err;
+
+       kvfree(in);
+       /* qpn is reserved for that QP */
+       qp->trans_qp.base.mqp.qpn = 0;
+       return 0;
+
+err:
+       kvfree(in);
+       return err;
+}
+
 static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                            struct ib_qp_init_attr *init_attr,
                            struct ib_udata *udata, struct mlx5_ib_qp *qp)
@@ -1211,6 +1466,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        struct mlx5_ib_create_qp_resp resp;
        struct mlx5_create_qp_mbox_in *in;
        struct mlx5_ib_create_qp ucmd;
+       struct mlx5_ib_cq *send_cq;
+       struct mlx5_ib_cq *recv_cq;
+       unsigned long flags;
        int inlen = sizeof(*in);
        int err;
        u32 uidx = MLX5_IB_DEFAULT_UIDX;
@@ -1227,6 +1485,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        spin_lock_init(&qp->sq.lock);
        spin_lock_init(&qp->rq.lock);
 
+       if (init_attr->rwq_ind_tbl) {
+               if (!udata)
+                       return -ENOSYS;
+
+               err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata);
+               return err;
+       }
+
        if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
                if (!MLX5_CAP_GEN(mdev, block_lb_mc)) {
                        mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
@@ -1460,6 +1726,23 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        base->container_mibqp = qp;
        base->mqp.event = mlx5_ib_qp_event;
 
+       get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq,
+               &send_cq, &recv_cq);
+       spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+       mlx5_ib_lock_cqs(send_cq, recv_cq);
+       /* Maintain device to QPs access, needed for further handling via reset
+        * flow
+        */
+       list_add_tail(&qp->qps_list, &dev->qp_list);
+       /* Maintain CQ to QPs access, needed for further handling via reset flow
+        */
+       if (send_cq)
+               list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp);
+       if (recv_cq)
+               list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp);
+       mlx5_ib_unlock_cqs(send_cq, recv_cq);
+       spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
        return 0;
 
 err_create:
@@ -1478,23 +1761,23 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv
        if (send_cq) {
                if (recv_cq) {
                        if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
-                               spin_lock_irq(&send_cq->lock);
+                               spin_lock(&send_cq->lock);
                                spin_lock_nested(&recv_cq->lock,
                                                 SINGLE_DEPTH_NESTING);
                        } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
-                               spin_lock_irq(&send_cq->lock);
+                               spin_lock(&send_cq->lock);
                                __acquire(&recv_cq->lock);
                        } else {
-                               spin_lock_irq(&recv_cq->lock);
+                               spin_lock(&recv_cq->lock);
                                spin_lock_nested(&send_cq->lock,
                                                 SINGLE_DEPTH_NESTING);
                        }
                } else {
-                       spin_lock_irq(&send_cq->lock);
+                       spin_lock(&send_cq->lock);
                        __acquire(&recv_cq->lock);
                }
        } else if (recv_cq) {
-               spin_lock_irq(&recv_cq->lock);
+               spin_lock(&recv_cq->lock);
                __acquire(&send_cq->lock);
        } else {
                __acquire(&send_cq->lock);
@@ -1509,21 +1792,21 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *re
                if (recv_cq) {
                        if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
                                spin_unlock(&recv_cq->lock);
-                               spin_unlock_irq(&send_cq->lock);
+                               spin_unlock(&send_cq->lock);
                        } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
                                __release(&recv_cq->lock);
-                               spin_unlock_irq(&send_cq->lock);
+                               spin_unlock(&send_cq->lock);
                        } else {
                                spin_unlock(&send_cq->lock);
-                               spin_unlock_irq(&recv_cq->lock);
+                               spin_unlock(&recv_cq->lock);
                        }
                } else {
                        __release(&recv_cq->lock);
-                       spin_unlock_irq(&send_cq->lock);
+                       spin_unlock(&send_cq->lock);
                }
        } else if (recv_cq) {
                __release(&send_cq->lock);
-               spin_unlock_irq(&recv_cq->lock);
+               spin_unlock(&recv_cq->lock);
        } else {
                __release(&recv_cq->lock);
                __release(&send_cq->lock);
@@ -1535,17 +1818,18 @@ static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
        return to_mpd(qp->ibqp.pd);
 }
 
-static void get_cqs(struct mlx5_ib_qp *qp,
+static void get_cqs(enum ib_qp_type qp_type,
+                   struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
                    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
 {
-       switch (qp->ibqp.qp_type) {
+       switch (qp_type) {
        case IB_QPT_XRC_TGT:
                *send_cq = NULL;
                *recv_cq = NULL;
                break;
        case MLX5_IB_QPT_REG_UMR:
        case IB_QPT_XRC_INI:
-               *send_cq = to_mcq(qp->ibqp.send_cq);
+               *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
                *recv_cq = NULL;
                break;
 
@@ -1557,8 +1841,8 @@ static void get_cqs(struct mlx5_ib_qp *qp,
        case IB_QPT_RAW_IPV6:
        case IB_QPT_RAW_ETHERTYPE:
        case IB_QPT_RAW_PACKET:
-               *send_cq = to_mcq(qp->ibqp.send_cq);
-               *recv_cq = to_mcq(qp->ibqp.recv_cq);
+               *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
+               *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL;
                break;
 
        case IB_QPT_MAX:
@@ -1577,8 +1861,14 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
        struct mlx5_ib_cq *send_cq, *recv_cq;
        struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
        struct mlx5_modify_qp_mbox_in *in;
+       unsigned long flags;
        int err;
 
+       if (qp->ibqp.rwq_ind_tbl) {
+               destroy_rss_raw_qp_tir(dev, qp);
+               return;
+       }
+
        base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
               &qp->raw_packet_qp.rq.base :
               &qp->trans_qp.base;
@@ -1602,17 +1892,28 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
                                     base->mqp.qpn);
        }
 
-       get_cqs(qp, &send_cq, &recv_cq);
+       get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
+               &send_cq, &recv_cq);
+
+       spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+       mlx5_ib_lock_cqs(send_cq, recv_cq);
+       /* del from lists under both locks above to protect reset flow paths */
+       list_del(&qp->qps_list);
+       if (send_cq)
+               list_del(&qp->cq_send_list);
+
+       if (recv_cq)
+               list_del(&qp->cq_recv_list);
 
        if (qp->create_type == MLX5_QP_KERNEL) {
-               mlx5_ib_lock_cqs(send_cq, recv_cq);
                __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
                                   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
                if (send_cq != recv_cq)
                        __mlx5_ib_cq_clean(send_cq, base->mqp.qpn,
                                           NULL);
-               mlx5_ib_unlock_cqs(send_cq, recv_cq);
        }
+       mlx5_ib_unlock_cqs(send_cq, recv_cq);
+       spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
 
        if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
                destroy_raw_packet_qp(dev, qp);
@@ -2300,7 +2601,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
        }
 
        pd = get_pd(qp);
-       get_cqs(qp, &send_cq, &recv_cq);
+       get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
+               &send_cq, &recv_cq);
 
        context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
        context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
@@ -2349,6 +2651,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
        else
                sqd_event = 0;
 
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+               u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
+                              qp->port) - 1;
+               struct mlx5_ib_port *mibport = &dev->port[port_num];
+
+               context->qp_counter_set_usr_page |=
+                       cpu_to_be32((u32)(mibport->q_cnt_id) << 24);
+       }
+
        if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
                context->sq_crq_size |= cpu_to_be16(1 << 4);
 
@@ -2439,6 +2750,9 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        int port;
        enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
 
+       if (ibqp->rwq_ind_tbl)
+               return -ENOSYS;
+
        if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask);
 
@@ -3397,6 +3711,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 {
        struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx5_core_dev *mdev = dev->mdev;
        struct mlx5_ib_qp *qp;
        struct mlx5_ib_mr *mr;
        struct mlx5_wqe_data_seg *dpseg;
@@ -3424,6 +3739,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
        spin_lock_irqsave(&qp->sq.lock, flags);
 
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               err = -EIO;
+               *bad_wr = wr;
+               nreq = 0;
+               goto out;
+       }
+
        for (nreq = 0; wr; nreq++, wr = wr->next) {
                if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
                        mlx5_ib_warn(dev, "\n");
@@ -3725,6 +4047,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
        struct mlx5_ib_qp *qp = to_mqp(ibqp);
        struct mlx5_wqe_data_seg *scat;
        struct mlx5_rwqe_sig *sig;
+       struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx5_core_dev *mdev = dev->mdev;
        unsigned long flags;
        int err = 0;
        int nreq;
@@ -3736,6 +4060,13 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 
        spin_lock_irqsave(&qp->rq.lock, flags);
 
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               err = -EIO;
+               *bad_wr = wr;
+               nreq = 0;
+               goto out;
+       }
+
        ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
 
        for (nreq = 0; wr; nreq++, wr = wr->next) {
@@ -4055,6 +4386,9 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
        int err = 0;
        u8 raw_packet_qp_state;
 
+       if (ibqp->rwq_ind_tbl)
+               return -ENOSYS;
+
        if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
                                            qp_init_attr);
@@ -4164,3 +4498,322 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
 
        return 0;
 }
+
+/*
+ * Build a CREATE_RQ command from the fields already stored in @rwq and
+ * submit it with mlx5_core_create_rq(), passing &rwq->rqn as the output.
+ *
+ * The command buffer is sized for the create_rq_in layout plus one 64-bit
+ * entry per page address (rwq->rq_num_pas) and is freed before returning.
+ *
+ * Returns -ENOMEM if the command buffer cannot be allocated, otherwise the
+ * return value of mlx5_core_create_rq().
+ */
+static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
+                    struct ib_wq_init_attr *init_attr)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       int inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
+                   sizeof(u64) * rwq->rq_num_pas;
+       __be64 *pas;
+       void *rqc;
+       void *wq;
+       void *in;
+       int ret;
+
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       /* RQ context: created in the RST state, attached to the caller's CQ */
+       rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+       MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
+       MLX5_SET(rqc, rqc, user_index, rwq->user_index);
+       MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn);
+       MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
+       MLX5_SET(rqc, rqc, flush_in_error_en, 1);
+
+       /* Work queue embedded in the RQ context */
+       wq = MLX5_ADDR_OF(rqc, rqc, wq);
+       MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+       MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
+       MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride);
+       MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size);
+       MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn);
+       MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset);
+       MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size);
+       MLX5_SET(wq, wq, wq_signature, rwq->wq_sig);
+       MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma);
+
+       /* Page address list follows the WQ fields in the command buffer */
+       pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
+       mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, pas, 0);
+
+       ret = mlx5_core_create_rq(dev->mdev, in, inlen, &rwq->rqn);
+       kvfree(in);
+       return ret;
+}
+
+/*
+ * Validate the user-requested RQ geometry and record it in @rwq.
+ *
+ * @wq_init_attr->max_wr is checked against the device's log_max_wq_sz
+ * capability; @ucmd->rq_wqe_count and @ucmd->rq_wqe_shift come straight
+ * from user space and are checked before being used to size the buffer.
+ *
+ * Returns 0 on success or -EINVAL if any of the values is unacceptable.
+ */
+static int set_user_rq_size(struct mlx5_ib_dev *dev,
+                           struct ib_wq_init_attr *wq_init_attr,
+                           struct mlx5_ib_create_wq *ucmd,
+                           struct mlx5_ib_rwq *rwq)
+{
+       /* Sanity check RQ size before proceeding */
+       if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz)))
+               return -EINVAL;
+
+       if (!ucmd->rq_wqe_count)
+               return -EINVAL;
+
+       /*
+        * rq_wqe_shift is user controlled: a shift of 32 or more is
+        * undefined for a 32-bit operand, and a smaller shift can still
+        * make wqe_count << wqe_shift wrap.  Reject both instead of
+        * computing a bogus buffer size.
+        * NOTE(review): assumes rwq->buf_size holds 32 bits - confirm
+        * against the struct mlx5_ib_rwq definition.
+        */
+       if (ucmd->rq_wqe_shift >= 32 ||
+           ucmd->rq_wqe_count > (U32_MAX >> ucmd->rq_wqe_shift))
+               return -EINVAL;
+
+       rwq->wqe_count = ucmd->rq_wqe_count;
+       rwq->wqe_shift = ucmd->rq_wqe_shift;
+       rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift);
+       rwq->log_rq_stride = rwq->wqe_shift;
+       rwq->log_rq_size = ilog2(rwq->wqe_count);
+       return 0;
+}
+
+/*
+ * Parse and validate the user's create-WQ command, then set up the
+ * user-space RQ resources for @rwq.
+ *
+ * The command must be at least large enough to cover the 'reserved' field;
+ * any bytes beyond sizeof(ucmd) must be zero, and comp_mask/reserved must
+ * be zero as well.  On success the RQ size fields of @rwq are filled in by
+ * set_user_rq_size(), create_user_rq() has been called, and
+ * rwq->user_index is taken from the command.
+ *
+ * Returns 0 on success, -EINVAL for a short command, -EOPNOTSUPP for
+ * non-zero trailing bytes / comp_mask / reserved, -EFAULT if the copy from
+ * user space fails, or the error returned by set_user_rq_size() or
+ * create_user_rq().
+ */
+static int prepare_user_rq(struct ib_pd *pd,
+                          struct ib_wq_init_attr *init_attr,
+                          struct ib_udata *udata,
+                          struct mlx5_ib_rwq *rwq)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       struct mlx5_ib_create_wq ucmd = {};
+       int err;
+       size_t required_cmd_sz;
+
+       required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+       if (udata->inlen < required_cmd_sz) {
+               mlx5_ib_dbg(dev, "invalid inlen\n");
+               return -EINVAL;
+       }
+
+       /* A longer command is accepted only if the part we do not know is zero */
+       if (udata->inlen > sizeof(ucmd) &&
+           !ib_is_udata_cleared(udata, sizeof(ucmd),
+                                udata->inlen - sizeof(ucmd))) {
+               mlx5_ib_dbg(dev, "inlen is not supported\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+               mlx5_ib_dbg(dev, "copy failed\n");
+               return -EFAULT;
+       }
+
+       if (ucmd.comp_mask) {
+               mlx5_ib_dbg(dev, "invalid comp mask\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (ucmd.reserved) {
+               mlx5_ib_dbg(dev, "invalid reserved\n");
+               return -EOPNOTSUPP;
+       }
+
+       err = set_user_rq_size(dev, init_attr, &ucmd, rwq);
+       if (err) {
+               mlx5_ib_dbg(dev, "err %d\n", err);
+               return err;
+       }
+
+       err = create_user_rq(dev, pd, rwq, &ucmd);
+       if (err) {
+               mlx5_ib_dbg(dev, "err %d\n", err);
+               return err;
+       }
+
+       rwq->user_index = ucmd.user_index;
+       return 0;
+}
+
+/*
+ * Create a work queue on behalf of user space.
+ *
+ * Only IB_WQT_RQ is handled; any other type is rejected with -EINVAL.
+ * A NULL @udata yields -ENOSYS, and a non-zero udata->outlen that is too
+ * small to hold the response up to and including its 'reserved' field
+ * yields -EINVAL.
+ *
+ * Returns the embedded struct ib_wq on success or an ERR_PTR() on failure;
+ * on failure everything set up so far is released again.
+ */
+struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+                               struct ib_wq_init_attr *init_attr,
+                               struct ib_udata *udata)
+{
+       struct mlx5_ib_create_wq_resp resp = {};
+       struct mlx5_ib_dev *dev;
+       struct mlx5_ib_rwq *rwq;
+       size_t min_resp_len;
+       int err;
+
+       if (!udata)
+               return ERR_PTR(-ENOSYS);
+
+       min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+       if (udata->outlen && udata->outlen < min_resp_len)
+               return ERR_PTR(-EINVAL);
+
+       dev = to_mdev(pd->device);
+       if (init_attr->wq_type != IB_WQT_RQ) {
+               mlx5_ib_dbg(dev, "unsupported wq type %d\n",
+                           init_attr->wq_type);
+               return ERR_PTR(-EINVAL);
+       }
+
+       rwq = kzalloc(sizeof(*rwq), GFP_KERNEL);
+       if (!rwq)
+               return ERR_PTR(-ENOMEM);
+
+       err = prepare_user_rq(pd, init_attr, udata, rwq);
+       if (err)
+               goto free_rwq;
+
+       err = create_rq(rwq, pd, init_attr);
+       if (err)
+               goto release_user_rq;
+
+       rwq->ibwq.wq_num = rwq->rqn;
+       rwq->ibwq.state = IB_WQS_RESET;
+
+       /* A response is written only when the caller supplied room for one */
+       if (udata->outlen) {
+               resp.response_length = offsetof(typeof(resp), response_length) +
+                               sizeof(resp.response_length);
+               err = ib_copy_to_udata(udata, &resp, resp.response_length);
+               if (err)
+                       goto destroy_rq;
+       }
+
+       return &rwq->ibwq;
+
+destroy_rq:
+       mlx5_core_destroy_rq(dev->mdev, rwq->rqn);
+release_user_rq:
+       destroy_user_rq(pd, rwq);
+free_rwq:
+       kfree(rwq);
+       return ERR_PTR(err);
+}
+
+/*
+ * Tear down a work queue created by mlx5_ib_create_wq(): destroy the RQ
+ * identified by rwq->rqn, release the user RQ resources and free the
+ * containing mlx5_ib_rwq.  Always returns 0.
+ */
+int mlx5_ib_destroy_wq(struct ib_wq *wq)
+{
+       struct mlx5_ib_rwq *rwq = to_mrwq(wq);
+
+       mlx5_core_destroy_rq(to_mdev(wq->device)->mdev, rwq->rqn);
+       destroy_user_rq(wq->pd, rwq);
+       kfree(rwq);
+
+       return 0;
+}
+
+/*
+ * Create a receive work queue indirection table (RQT) holding
+ * 1 << init_attr->log_ind_tbl_size entries, each taken from the wq_num of
+ * the corresponding init_attr->ind_tbl[] element.
+ *
+ * Any input bytes supplied through @udata must be zero (-EOPNOTSUPP
+ * otherwise).  A non-zero udata->outlen that cannot hold the response up
+ * to and including its 'reserved' field is rejected with -EINVAL, as is a
+ * table size larger than the device's log_max_rqt_size capability.
+ *
+ * Returns the embedded struct ib_rwq_ind_table on success or an ERR_PTR()
+ * on failure.
+ */
+struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
+                                                     struct ib_rwq_ind_table_init_attr *init_attr,
+                                                     struct ib_udata *udata)
+{
+       struct mlx5_ib_dev *dev = to_mdev(device);
+       struct mlx5_ib_rwq_ind_table *rwq_ind_tbl;
+       struct mlx5_ib_create_rwq_ind_tbl_resp resp = {};
+       size_t min_resp_len;
+       int inlen;
+       int err;
+       int sz;
+       int i;
+       u32 *in;
+       void *rqtc;
+
+       if (udata->inlen > 0 &&
+           !ib_is_udata_cleared(udata, 0,
+                                udata->inlen))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       /*
+        * Bound the requested size by what the device advertises before
+        * it is used as a shift count and to size the command buffer.
+        */
+       if (init_attr->log_ind_tbl_size >
+           MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) {
+               mlx5_ib_dbg(dev, "log_ind_tbl_size = %u is bigger than supported = %u\n",
+                           init_attr->log_ind_tbl_size,
+                           MLX5_CAP_GEN(dev->mdev, log_max_rqt_size));
+               return ERR_PTR(-EINVAL);
+       }
+
+       min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+       if (udata->outlen && udata->outlen < min_resp_len)
+               return ERR_PTR(-EINVAL);
+
+       rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL);
+       if (!rwq_ind_tbl)
+               return ERR_PTR(-ENOMEM);
+
+       sz = 1 << init_attr->log_ind_tbl_size;
+       inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
+       in = mlx5_vzalloc(inlen);
+       if (!in) {
+               err = -ENOMEM;
+               goto err;
+       }
+
+       rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+
+       MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+       MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
+
+       for (i = 0; i < sz; i++)
+               MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num);
+
+       err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn);
+       /* The command buffer is no longer needed whether or not this worked */
+       kvfree(in);
+
+       if (err)
+               goto err;
+
+       rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn;
+       if (udata->outlen) {
+               resp.response_length = offsetof(typeof(resp), response_length) +
+                                       sizeof(resp.response_length);
+               err = ib_copy_to_udata(udata, &resp, resp.response_length);
+               if (err)
+                       goto err_copy;
+       }
+
+       return &rwq_ind_tbl->ib_rwq_ind_tbl;
+
+err_copy:
+       mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn);
+err:
+       kfree(rwq_ind_tbl);
+       return ERR_PTR(err);
+}
+
+/*
+ * Destroy the RQT identified by rwq_ind_tbl->rqtn and free the containing
+ * mlx5_ib_rwq_ind_table.  Always returns 0.
+ */
+int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device);
+       struct mlx5_ib_rwq_ind_table *tbl;
+
+       tbl = to_mrwq_ind_table(ib_rwq_ind_tbl);
+       mlx5_core_destroy_rqt(dev->mdev, tbl->rqtn);
+       kfree(tbl);
+
+       return 0;
+}
+
+/*
+ * Change the state of a work queue.
+ *
+ * The user command must cover at least the 'reserved' field, any bytes
+ * beyond sizeof(ucmd) must be zero, and comp_mask/reserved must be zero.
+ * The current state is taken from @wq_attr when IB_WQ_CUR_STATE is set in
+ * @wq_attr_mask, otherwise from wq->state; the target state is taken from
+ * @wq_attr when IB_WQ_STATE is set, otherwise it equals the current state.
+ * IB_WQS_ERR is translated to MLX5_RQC_STATE_ERR for the command; other
+ * values are passed through unchanged.
+ *
+ * Returns 0 on success (and records the new state in rwq->ibwq.state),
+ * -EINVAL, -EOPNOTSUPP, -EFAULT or -ENOMEM for the checks above, or the
+ * error returned by mlx5_core_modify_rq().
+ */
+int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+                     u32 wq_attr_mask, struct ib_udata *udata)
+{
+       struct mlx5_ib_dev *dev = to_mdev(wq->device);
+       struct mlx5_ib_rwq *rwq = to_mrwq(wq);
+       struct mlx5_ib_modify_wq ucmd = {};
+       size_t min_cmd_sz;
+       int cur_state;
+       int new_state;
+       int cmd_len;
+       void *cmd;
+       void *rqc;
+       int ret;
+
+       min_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+       if (udata->inlen < min_cmd_sz)
+               return -EINVAL;
+
+       if (udata->inlen > sizeof(ucmd) &&
+           !ib_is_udata_cleared(udata, sizeof(ucmd),
+                                udata->inlen - sizeof(ucmd)))
+               return -EOPNOTSUPP;
+
+       if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
+               return -EFAULT;
+
+       if (ucmd.comp_mask || ucmd.reserved)
+               return -EOPNOTSUPP;
+
+       cmd_len = MLX5_ST_SZ_BYTES(modify_rq_in);
+       cmd = mlx5_vzalloc(cmd_len);
+       if (!cmd)
+               return -ENOMEM;
+
+       rqc = MLX5_ADDR_OF(modify_rq_in, cmd, ctx);
+
+       /* Pick the IB-level states first ... */
+       if (wq_attr_mask & IB_WQ_CUR_STATE)
+               cur_state = wq_attr->curr_wq_state;
+       else
+               cur_state = wq->state;
+
+       if (wq_attr_mask & IB_WQ_STATE)
+               new_state = wq_attr->wq_state;
+       else
+               new_state = cur_state;
+
+       /* ... then translate the error state to its device encoding */
+       if (cur_state == IB_WQS_ERR)
+               cur_state = MLX5_RQC_STATE_ERR;
+       if (new_state == IB_WQS_ERR)
+               new_state = MLX5_RQC_STATE_ERR;
+
+       MLX5_SET(modify_rq_in, cmd, rq_state, cur_state);
+       MLX5_SET(rqc, rqc, state, new_state);
+
+       ret = mlx5_core_modify_rq(dev->mdev, rwq->rqn, cmd, cmd_len);
+       kvfree(cmd);
+       if (ret)
+               return ret;
+
+       /* Store the IB-level value, undoing the error-state translation */
+       if (new_state == MLX5_RQC_STATE_ERR)
+               rwq->ibwq.state = IB_WQS_ERR;
+       else
+               rwq->ibwq.state = new_state;
+
+       return 0;
+}
index 3b2ddd6..ed6ac52 100644 (file)
@@ -74,14 +74,12 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
 }
 
 static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
-                          struct mlx5_create_srq_mbox_in **in,
-                          struct ib_udata *udata, int buf_size, int *inlen,
-                          int is_xrc)
+                          struct mlx5_srq_attr *in,
+                          struct ib_udata *udata, int buf_size)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_create_srq ucmd = {};
        size_t ucmdlen;
-       void *xsrqc;
        int err;
        int npages;
        int page_shift;
@@ -104,7 +102,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                                 udata->inlen - sizeof(ucmd)))
                return -EINVAL;
 
-       if (is_xrc) {
+       if (in->type == IB_SRQT_XRC) {
                err = get_srq_user_index(to_mucontext(pd->uobject->context),
                                         &ucmd, udata->inlen, &uidx);
                if (err)
@@ -130,14 +128,13 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                goto err_umem;
        }
 
-       *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
-       *in = mlx5_vzalloc(*inlen);
-       if (!(*in)) {
+       in->pas = mlx5_vzalloc(sizeof(*in->pas) * ncont);
+       if (!in->pas) {
                err = -ENOMEM;
                goto err_umem;
        }
 
-       mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0);
+       mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
 
        err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
                                  ucmd.db_addr, &srq->db);
@@ -146,20 +143,16 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                goto err_in;
        }
 
-       (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
-       (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
-
-       if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
-            is_xrc){
-               xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
-                                    xrc_srq_context_entry);
-               MLX5_SET(xrc_srqc, xsrqc, user_index, uidx);
-       }
+       in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+       in->page_offset = offset;
+       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
+           in->type == IB_SRQT_XRC)
+               in->user_index = uidx;
 
        return 0;
 
 err_in:
-       kvfree(*in);
+       kvfree(in->pas);
 
 err_umem:
        ib_umem_release(srq->umem);
@@ -168,15 +161,13 @@ err_umem:
 }
 
 static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
-                            struct mlx5_create_srq_mbox_in **in, int buf_size,
-                            int *inlen, int is_xrc)
+                            struct mlx5_srq_attr *in, int buf_size)
 {
        int err;
        int i;
        struct mlx5_wqe_srq_next_seg *next;
        int page_shift;
        int npages;
-       void *xsrqc;
 
        err = mlx5_db_alloc(dev->mdev, &srq->db);
        if (err) {
@@ -204,13 +195,12 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
        npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT));
        mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n",
                    buf_size, page_shift, srq->buf.npages, npages);
-       *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages;
-       *in = mlx5_vzalloc(*inlen);
-       if (!*in) {
+       in->pas = mlx5_vzalloc(sizeof(*in->pas) * npages);
+       if (!in->pas) {
                err = -ENOMEM;
                goto err_buf;
        }
-       mlx5_fill_page_array(&srq->buf, (*in)->pas);
+       mlx5_fill_page_array(&srq->buf, in->pas);
 
        srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL);
        if (!srq->wrid) {
@@ -221,20 +211,15 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
        }
        srq->wq_sig = !!srq_signature;
 
-       (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
-
-       if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
-            is_xrc){
-               xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
-                                    xrc_srq_context_entry);
-               /* 0xffffff means we ask to work with cqe version 0 */
-               MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX);
-       }
+       in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
+           in->type == IB_SRQT_XRC)
+               in->user_index = MLX5_IB_DEFAULT_UIDX;
 
        return 0;
 
 err_in:
-       kvfree(*in);
+       kvfree(in->pas);
 
 err_buf:
        mlx5_buf_free(dev->mdev, &srq->buf);
@@ -267,10 +252,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
        int desc_size;
        int buf_size;
        int err;
-       struct mlx5_create_srq_mbox_in *uninitialized_var(in);
-       int uninitialized_var(inlen);
-       int is_xrc;
-       u32 flgs, xrcdn;
+       struct mlx5_srq_attr in = {0};
        __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
 
        /* Sanity check SRQ size before proceeding */
@@ -302,14 +284,10 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
                    desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
                    srq->msrq.max_avail_gather);
 
-       is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
-
        if (pd->uobject)
-               err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen,
-                                     is_xrc);
+               err = create_srq_user(pd, srq, &in, udata, buf_size);
        else
-               err = create_srq_kernel(dev, srq, &in, buf_size, &inlen,
-                                       is_xrc);
+               err = create_srq_kernel(dev, srq, &in, buf_size);
 
        if (err) {
                mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
@@ -317,23 +295,23 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
                goto err_srq;
        }
 
-       in->ctx.state_log_sz = ilog2(srq->msrq.max);
-       flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24;
-       xrcdn = 0;
-       if (is_xrc) {
-               xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
-               in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn);
+       in.type = init_attr->srq_type;
+       in.log_size = ilog2(srq->msrq.max);
+       in.wqe_shift = srq->msrq.wqe_shift - 4;
+       if (srq->wq_sig)
+               in.flags |= MLX5_SRQ_FLAG_WQ_SIG;
+       if (init_attr->srq_type == IB_SRQT_XRC) {
+               in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
+               in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn;
        } else if (init_attr->srq_type == IB_SRQT_BASIC) {
-               xrcdn = to_mxrcd(dev->devr.x0)->xrcdn;
-               in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn);
+               in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn;
+               in.cqn = to_mcq(dev->devr.c0)->mcq.cqn;
        }
 
-       in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF));
-
-       in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
-       in->ctx.db_record = cpu_to_be64(srq->db.dma);
-       err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc);
-       kvfree(in);
+       in.pd = to_mpd(pd)->pdn;
+       in.db_record = srq->db.dma;
+       err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in);
+       kvfree(in.pas);
        if (err) {
                mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
                goto err_usr_kern_srq;
@@ -401,7 +379,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
        struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
        struct mlx5_ib_srq *srq = to_msrq(ibsrq);
        int ret;
-       struct mlx5_query_srq_mbox_out *out;
+       struct mlx5_srq_attr *out;
 
        out = kzalloc(sizeof(*out), GFP_KERNEL);
        if (!out)
@@ -411,7 +389,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
        if (ret)
                goto out_box;
 
-       srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm);
+       srq_attr->srq_limit = out->lwm;
        srq_attr->max_wr    = srq->msrq.max - 1;
        srq_attr->max_sge   = srq->msrq.max_gs;
 
@@ -458,6 +436,8 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
        struct mlx5_ib_srq *srq = to_msrq(ibsrq);
        struct mlx5_wqe_srq_next_seg *next;
        struct mlx5_wqe_data_seg *scat;
+       struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+       struct mlx5_core_dev *mdev = dev->mdev;
        unsigned long flags;
        int err = 0;
        int nreq;
@@ -465,6 +445,12 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 
        spin_lock_irqsave(&srq->lock, flags);
 
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               err = -EIO;
+               *bad_wr = wr;
+               goto out;
+       }
+
        for (nreq = 0; wr; nreq++, wr = wr->next) {
                if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
                        err = -EINVAL;
@@ -507,7 +493,7 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 
                *srq->db.db = cpu_to_be32(srq->wqe_ctr);
        }
-
+out:
        spin_unlock_irqrestore(&srq->lock, flags);
 
        return err;
index 61bc308..188dac4 100644 (file)
@@ -46,6 +46,10 @@ enum {
        MLX5_SRQ_FLAG_SIGNATURE         = 1 << 0,
 };
 
+enum {
+       MLX5_WQ_FLAG_SIGNATURE          = 1 << 0,
+};
+
 
 /* Increment this value if any changes that break userspace ABI
  * compatibility are made.
@@ -79,6 +83,10 @@ enum mlx5_ib_alloc_ucontext_resp_mask {
        MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
 };
 
+enum mlx5_user_cmds_supp_uhw {
+       MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0,
+};
+
 struct mlx5_ib_alloc_ucontext_resp {
        __u32   qp_tab_size;
        __u32   bf_reg_size;
@@ -94,8 +102,8 @@ struct mlx5_ib_alloc_ucontext_resp {
        __u32   comp_mask;
        __u32   response_length;
        __u8    cqe_version;
-       __u8    reserved2;
-       __u16   reserved3;
+       __u8    cmds_supp_uhw;
+       __u16   reserved2;
        __u64   hca_core_clock_offset;
 };
 
@@ -103,6 +111,22 @@ struct mlx5_ib_alloc_pd_resp {
        __u32   pdn;
 };
 
+struct mlx5_ib_tso_caps {
+       __u32 max_tso; /* Maximum tso payload size in bytes */
+
+       /* Corresponding bit will be set if qp type from
+        * 'enum ib_qp_type' is supported, e.g.
+        * supported_qpts |= 1 << IB_QPT_UD
+        */
+       __u32 supported_qpts;
+};
+
+struct mlx5_ib_query_device_resp {
+       __u32   comp_mask;
+       __u32   response_length;
+       struct  mlx5_ib_tso_caps tso_caps;
+};
+
 struct mlx5_ib_create_cq {
        __u64   buf_addr;
        __u64   db_addr;
@@ -148,6 +172,40 @@ struct mlx5_ib_create_qp {
        __u64   sq_buf_addr;
 };
 
+/* RX Hash function flags */
+enum mlx5_rx_hash_function_flags {
+       MLX5_RX_HASH_FUNC_TOEPLITZ      = 1 << 0,
+};
+
+/*
+ * RX Hash flags. These flags select which fields of an incoming packet
+ * participate in the RX Hash. Each flag represents a certain packet field;
+ * when the flag is set, the field that is represented by the flag will
+ * participate in the RX Hash calculation.
+ * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP
+ * and *TCP and *UDP flags can't be enabled together on the same QP.
+*/
+enum mlx5_rx_hash_fields {
+       MLX5_RX_HASH_SRC_IPV4   = 1 << 0,
+       MLX5_RX_HASH_DST_IPV4   = 1 << 1,
+       MLX5_RX_HASH_SRC_IPV6   = 1 << 2,
+       MLX5_RX_HASH_DST_IPV6   = 1 << 3,
+       MLX5_RX_HASH_SRC_PORT_TCP       = 1 << 4,
+       MLX5_RX_HASH_DST_PORT_TCP       = 1 << 5,
+       MLX5_RX_HASH_SRC_PORT_UDP       = 1 << 6,
+       MLX5_RX_HASH_DST_PORT_UDP       = 1 << 7
+};
+
+struct mlx5_ib_create_qp_rss {
+       __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */
+       __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */
+       __u8 rx_key_len; /* valid only for Toeplitz */
+       __u8 reserved[6];
+       __u8 rx_hash_key[128]; /* valid only for Toeplitz */
+       __u32   comp_mask;
+       __u32   reserved1;
+};
+
 struct mlx5_ib_create_qp_resp {
        __u32   uuar_index;
 };
@@ -159,6 +217,32 @@ struct mlx5_ib_alloc_mw {
        __u16   reserved2;
 };
 
+struct mlx5_ib_create_wq {
+       __u64   buf_addr;
+       __u64   db_addr;
+       __u32   rq_wqe_count;
+       __u32   rq_wqe_shift;
+       __u32   user_index;
+       __u32   flags;
+       __u32   comp_mask;
+       __u32   reserved;
+};
+
+struct mlx5_ib_create_wq_resp {
+       __u32   response_length;
+       __u32   reserved;
+};
+
+struct mlx5_ib_create_rwq_ind_tbl_resp {
+       __u32   response_length;
+       __u32   reserved;
+};
+
+struct mlx5_ib_modify_wq {
+       __u32   comp_mask;
+       __u32   reserved;
+};
+
 static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext,
                                    struct mlx5_ib_create_qp *ucmd,
                                    int inlen,
index 9866c35..da2335f 100644 (file)
@@ -1081,16 +1081,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr,
        return sprintf(buf, "%x\n", dev->rev_id);
 }
 
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                          char *buf)
-{
-       struct mthca_dev *dev =
-               container_of(device, struct mthca_dev, ib_dev.dev);
-       return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32),
-                      (int) (dev->fw_ver >> 16) & 0xffff,
-                      (int) dev->fw_ver & 0xffff);
-}
-
 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
                        char *buf)
 {
@@ -1120,13 +1110,11 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
 }
 
 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
 
 static struct device_attribute *mthca_dev_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type,
        &dev_attr_board_id
 };
@@ -1187,6 +1175,17 @@ static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+static void get_dev_fw_str(struct ib_device *device, char *str,
+                          size_t str_len)
+{
+       struct mthca_dev *dev =
+               container_of(device, struct mthca_dev, ib_dev);
+       snprintf(str, str_len, "%d.%d.%d",
+                (int) (dev->fw_ver >> 32),
+                (int) (dev->fw_ver >> 16) & 0xffff,
+                (int) dev->fw_ver & 0xffff);
+}
+
 int mthca_register_device(struct mthca_dev *dev)
 {
        int ret;
@@ -1266,6 +1265,7 @@ int mthca_register_device(struct mthca_dev *dev)
        dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
        dev->ib_dev.dereg_mr             = mthca_dereg_mr;
        dev->ib_dev.get_port_immutable   = mthca_port_immutable;
+       dev->ib_dev.get_dev_fw_str       = get_dev_fw_str;
 
        if (dev->mthca_flags & MTHCA_FLAG_FMR) {
                dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
index 74c6a94..6727af2 100644 (file)
@@ -98,7 +98,7 @@ int mthca_reset(struct mthca_dev *mdev)
                err = -ENOMEM;
                mthca_err(mdev, "Couldn't allocate memory to save HCA "
                          "PCI header, aborting.\n");
-               goto out;
+               goto put_dev;
        }
 
        for (i = 0; i < 64; ++i) {
@@ -108,7 +108,7 @@ int mthca_reset(struct mthca_dev *mdev)
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't save HCA "
                                  "PCI header, aborting.\n");
-                       goto out;
+                       goto free_hca;
                }
        }
 
@@ -121,7 +121,7 @@ int mthca_reset(struct mthca_dev *mdev)
                        err = -ENOMEM;
                        mthca_err(mdev, "Couldn't allocate memory to save HCA "
                                  "bridge PCI header, aborting.\n");
-                       goto out;
+                       goto free_hca;
                }
 
                for (i = 0; i < 64; ++i) {
@@ -131,7 +131,7 @@ int mthca_reset(struct mthca_dev *mdev)
                                err = -ENODEV;
                                mthca_err(mdev, "Couldn't save HCA bridge "
                                          "PCI header, aborting.\n");
-                               goto out;
+                               goto free_bh;
                        }
                }
                bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
@@ -139,7 +139,7 @@ int mthca_reset(struct mthca_dev *mdev)
                                err = -ENODEV;
                                mthca_err(mdev, "Couldn't locate HCA bridge "
                                          "PCI-X capability, aborting.\n");
-                               goto out;
+                               goto free_bh;
                }
        }
 
@@ -152,7 +152,7 @@ int mthca_reset(struct mthca_dev *mdev)
                        err = -ENOMEM;
                        mthca_err(mdev, "Couldn't map HCA reset register, "
                                  "aborting.\n");
-                       goto out;
+                       goto free_bh;
                }
 
                writel(MTHCA_RESET_VALUE, reset);
@@ -172,7 +172,7 @@ int mthca_reset(struct mthca_dev *mdev)
                                err = -ENODEV;
                                mthca_err(mdev, "Couldn't access HCA after reset, "
                                          "aborting.\n");
-                               goto out;
+                               goto free_bh;
                        }
 
                        if (v != 0xffffffff)
@@ -184,7 +184,7 @@ int mthca_reset(struct mthca_dev *mdev)
                err = -ENODEV;
                mthca_err(mdev, "PCI device did not come back after reset, "
                          "aborting.\n");
-               goto out;
+               goto free_bh;
        }
 
 good:
@@ -195,14 +195,14 @@ good:
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't restore HCA bridge Upstream "
                                  "split transaction control, aborting.\n");
-                       goto out;
+                       goto free_bh;
                }
                if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc,
                                 bridge_header[(bridge_pcix_cap + 0xc) / 4])) {
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't restore HCA bridge Downstream "
                                  "split transaction control, aborting.\n");
-                       goto out;
+                       goto free_bh;
                }
                /*
                 * Bridge control register is at 0x3e, so we'll
@@ -216,7 +216,7 @@ good:
                                err = -ENODEV;
                                mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
                                          "aborting.\n", i);
-                               goto out;
+                               goto free_bh;
                        }
                }
 
@@ -225,7 +225,7 @@ good:
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
                                  "aborting.\n");
-                       goto out;
+                       goto free_bh;
                }
        }
 
@@ -235,7 +235,7 @@ good:
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't restore HCA PCI-X "
                                  "command register, aborting.\n");
-                       goto out;
+                       goto free_bh;
                }
        }
 
@@ -246,7 +246,7 @@ good:
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't restore HCA PCI Express "
                                  "Device Control register, aborting.\n");
-                       goto out;
+                       goto free_bh;
                }
                linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4];
                if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL,
@@ -254,7 +254,7 @@ good:
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't restore HCA PCI Express "
                                  "Link control register, aborting.\n");
-                       goto out;
+                       goto free_bh;
                }
        }
 
@@ -266,7 +266,7 @@ good:
                        err = -ENODEV;
                        mthca_err(mdev, "Couldn't restore HCA reg %x, "
                                  "aborting.\n", i);
-                       goto out;
+                       goto free_bh;
                }
        }
 
@@ -275,14 +275,12 @@ good:
                err = -ENODEV;
                mthca_err(mdev, "Couldn't restore HCA COMMAND, "
                          "aborting.\n");
-               goto out;
        }
-
-out:
-       if (bridge)
-               pci_dev_put(bridge);
+free_bh:
        kfree(bridge_header);
+free_hca:
        kfree(hca_header);
-
+put_dev:
+       pci_dev_put(bridge);
        return err;
 }
index 464d6da..bd69125 100644 (file)
@@ -2605,23 +2605,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
 }
 
 
-/**
- * show_fw_ver
- */
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
-                          char *buf)
-{
-       struct nes_ib_device *nesibdev =
-                       container_of(dev, struct nes_ib_device, ibdev.dev);
-       struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
-       nes_debug(NES_DBG_INIT, "\n");
-       return sprintf(buf, "%u.%u\n",
-               (nesvnic->nesdev->nesadapter->firmware_version >> 16),
-               (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
-}
-
-
 /**
  * show_hca
  */
@@ -2645,13 +2628,11 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
 
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
 
 static struct device_attribute *nes_dev_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type,
        &dev_attr_board_id
 };
@@ -3703,6 +3684,19 @@ static int nes_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+static void get_dev_fw_str(struct ib_device *dev, char *str,
+                          size_t str_len)
+{
+       struct nes_ib_device *nesibdev =
+                       container_of(dev, struct nes_ib_device, ibdev);
+       struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+       nes_debug(NES_DBG_INIT, "\n");
+       snprintf(str, str_len, "%u.%u",
+                (nesvnic->nesdev->nesadapter->firmware_version >> 16),
+                (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
+}
+
 /**
  * nes_init_ofa_device
  */
@@ -3802,6 +3796,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
        nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
        nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
        nesibdev->ibdev.get_port_immutable   = nes_port_immutable;
+       nesibdev->ibdev.get_dev_fw_str   = get_dev_fw_str;
        memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name,
               sizeof(nesibdev->ibdev.iwcm->ifname));
 
index 3d75f65..07d0c6c 100644 (file)
@@ -107,6 +107,14 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+static void get_dev_fw_str(struct ib_device *device, char *str,
+                          size_t str_len)
+{
+       struct ocrdma_dev *dev = get_ocrdma_dev(device);
+
+       snprintf(str, str_len, "%s", &dev->attr.fw_ver[0]);
+}
+
 static int ocrdma_register_device(struct ocrdma_dev *dev)
 {
        strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX);
@@ -193,6 +201,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 
        dev->ibdev.process_mad = ocrdma_process_mad;
        dev->ibdev.get_port_immutable = ocrdma_port_immutable;
+       dev->ibdev.get_dev_fw_str     = get_dev_fw_str;
 
        if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
                dev->ibdev.uverbs_cmd_mask |=
@@ -262,14 +271,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr,
        return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
 }
 
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct ocrdma_dev *dev = dev_get_drvdata(device);
-
-       return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]);
-}
-
 static ssize_t show_hca_type(struct device *device,
                             struct device_attribute *attr, char *buf)
 {
@@ -279,12 +280,10 @@ static ssize_t show_hca_type(struct device *device,
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
 
 static struct device_attribute *ocrdma_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
        &dev_attr_hca_type
 };
 
index 565c881..c229b9f 100644 (file)
@@ -331,6 +331,21 @@ static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+static void usnic_get_dev_fw_str(struct ib_device *device,
+                                char *str,
+                                size_t str_len)
+{
+       struct usnic_ib_dev *us_ibdev =
+               container_of(device, struct usnic_ib_dev, ib_dev);
+       struct ethtool_drvinfo info;
+
+       mutex_lock(&us_ibdev->usdev_lock);
+       us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
+       mutex_unlock(&us_ibdev->usdev_lock);
+
+       snprintf(str, str_len, "%s", info.fw_version);
+}
+
 /* Start of PF discovery section */
 static void *usnic_ib_device_add(struct pci_dev *dev)
 {
@@ -414,6 +429,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
        us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq;
        us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr;
        us_ibdev->ib_dev.get_port_immutable = usnic_port_immutable;
+       us_ibdev->ib_dev.get_dev_fw_str     = usnic_get_dev_fw_str;
 
 
        if (ib_register_device(&us_ibdev->ib_dev, NULL))
index 3412ea0..80ef3f8 100644 (file)
 #include "usnic_ib_verbs.h"
 #include "usnic_log.h"
 
-static ssize_t usnic_ib_show_fw_ver(struct device *device,
-                                       struct device_attribute *attr,
-                                       char *buf)
-{
-       struct usnic_ib_dev *us_ibdev =
-               container_of(device, struct usnic_ib_dev, ib_dev.dev);
-       struct ethtool_drvinfo info;
-
-       mutex_lock(&us_ibdev->usdev_lock);
-       us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
-       mutex_unlock(&us_ibdev->usdev_lock);
-
-       return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version);
-}
-
 static ssize_t usnic_ib_show_board(struct device *device,
                                        struct device_attribute *attr,
                                        char *buf)
@@ -192,7 +177,6 @@ usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr,
                        us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
 }
 
-static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL);
 static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL);
 static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL);
 static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL);
@@ -201,7 +185,6 @@ static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL);
 static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL);
 
 static struct device_attribute *usnic_class_attributes[] = {
-       &dev_attr_fw_ver,
        &dev_attr_board_id,
        &dev_attr_config,
        &dev_attr_iface,
index 988b6a0..8b095b2 100644 (file)
@@ -1 +1,2 @@
 obj-$(CONFIG_INFINIBAND_RDMAVT)                += rdmavt/
+obj-$(CONFIG_RDMA_RXE)                 += rxe/
index 11aa6a3..1da8d01 100644 (file)
@@ -1,6 +1,5 @@
 config INFINIBAND_RDMAVT
        tristate "RDMA verbs transport library"
        depends on 64BIT
-       default m
        ---help---
        This is a common software verbs provider for RDMA networks.
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig
new file mode 100644 (file)
index 0000000..1e4e628
--- /dev/null
@@ -0,0 +1,24 @@
+config RDMA_RXE
+       tristate "Software RDMA over Ethernet (RoCE) driver"
+       depends on INET && PCI && INFINIBAND
+       depends on NET_UDP_TUNNEL
+       ---help---
+       This driver implements the InfiniBand RDMA transport over
+       the Linux network stack. It enables a system with a
+       standard Ethernet adapter to interoperate with a RoCE
+       adapter or with another system running the RXE driver.
+       Documentation on InfiniBand and RoCE can be downloaded at
+       www.infinibandta.org and www.openfabrics.org. (See also
+       siw which is a similar software driver for iWARP.)
+
+       The driver is split into two layers: one interfaces with the
+       Linux RDMA stack and implements a kernel or user space
+       verbs API. The user space verbs API requires a support
+       library named librxe which is loaded by the generic user
+       space verbs API, libibverbs. The other layer interfaces
+       with the Linux network stack at layer 3.
+
+       To configure and work with the soft-RoCE driver, please see the
+       "configure Soft-RoCE (RXE)" section of the following wiki page:
+
+       https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile
new file mode 100644 (file)
index 0000000..3b3fb9d
--- /dev/null
@@ -0,0 +1,24 @@
+obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o
+
+rdma_rxe-y := \
+       rxe.o \
+       rxe_comp.o \
+       rxe_req.o \
+       rxe_resp.o \
+       rxe_recv.o \
+       rxe_pool.o \
+       rxe_queue.o \
+       rxe_verbs.o \
+       rxe_av.o \
+       rxe_srq.o \
+       rxe_qp.o \
+       rxe_cq.o \
+       rxe_mr.o \
+       rxe_dma.o \
+       rxe_opcode.o \
+       rxe_mmap.o \
+       rxe_icrc.o \
+       rxe_mcast.o \
+       rxe_task.o \
+       rxe_net.o \
+       rxe_sysfs.o
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
new file mode 100644 (file)
index 0000000..55f0e8f
--- /dev/null
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib");
+MODULE_DESCRIPTION("Soft RDMA transport");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.2");
+
+/* free resources for all ports on a device */
+static void rxe_cleanup_ports(struct rxe_dev *rxe)
+{
+       kfree(rxe->port.pkey_tbl);
+       rxe->port.pkey_tbl = NULL;
+
+}
+
+/* free resources for a rxe device; all objects created for this device must
+ * have been destroyed
+ */
+static void rxe_cleanup(struct rxe_dev *rxe)
+{
+       rxe_pool_cleanup(&rxe->uc_pool);
+       rxe_pool_cleanup(&rxe->pd_pool);
+       rxe_pool_cleanup(&rxe->ah_pool);
+       rxe_pool_cleanup(&rxe->srq_pool);
+       rxe_pool_cleanup(&rxe->qp_pool);
+       rxe_pool_cleanup(&rxe->cq_pool);
+       rxe_pool_cleanup(&rxe->mr_pool);
+       rxe_pool_cleanup(&rxe->mw_pool);
+       rxe_pool_cleanup(&rxe->mc_grp_pool);
+       rxe_pool_cleanup(&rxe->mc_elem_pool);
+
+       rxe_cleanup_ports(rxe);
+}
+
+/* called when all references have been dropped */
+void rxe_release(struct kref *kref)
+{
+       struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
+
+       rxe_cleanup(rxe);
+       ib_dealloc_device(&rxe->ib_dev);
+}
+
+void rxe_dev_put(struct rxe_dev *rxe)
+{
+       kref_put(&rxe->ref_cnt, rxe_release);
+}
+EXPORT_SYMBOL_GPL(rxe_dev_put);
+
+/* initialize rxe device parameters */
+static int rxe_init_device_param(struct rxe_dev *rxe)
+{
+       rxe->max_inline_data                    = RXE_MAX_INLINE_DATA;
+
+       rxe->attr.fw_ver                        = RXE_FW_VER;
+       rxe->attr.max_mr_size                   = RXE_MAX_MR_SIZE;
+       rxe->attr.page_size_cap                 = RXE_PAGE_SIZE_CAP;
+       rxe->attr.vendor_id                     = RXE_VENDOR_ID;
+       rxe->attr.vendor_part_id                = RXE_VENDOR_PART_ID;
+       rxe->attr.hw_ver                        = RXE_HW_VER;
+       rxe->attr.max_qp                        = RXE_MAX_QP;
+       rxe->attr.max_qp_wr                     = RXE_MAX_QP_WR;
+       rxe->attr.device_cap_flags              = RXE_DEVICE_CAP_FLAGS;
+       rxe->attr.max_sge                       = RXE_MAX_SGE;
+       rxe->attr.max_sge_rd                    = RXE_MAX_SGE_RD;
+       rxe->attr.max_cq                        = RXE_MAX_CQ;
+       rxe->attr.max_cqe                       = (1 << RXE_MAX_LOG_CQE) - 1;
+       rxe->attr.max_mr                        = RXE_MAX_MR;
+       rxe->attr.max_pd                        = RXE_MAX_PD;
+       rxe->attr.max_qp_rd_atom                = RXE_MAX_QP_RD_ATOM;
+       rxe->attr.max_ee_rd_atom                = RXE_MAX_EE_RD_ATOM;
+       rxe->attr.max_res_rd_atom               = RXE_MAX_RES_RD_ATOM;
+       rxe->attr.max_qp_init_rd_atom           = RXE_MAX_QP_INIT_RD_ATOM;
+       rxe->attr.max_ee_init_rd_atom           = RXE_MAX_EE_INIT_RD_ATOM;
+       rxe->attr.atomic_cap                    = RXE_ATOMIC_CAP;
+       rxe->attr.max_ee                        = RXE_MAX_EE;
+       rxe->attr.max_rdd                       = RXE_MAX_RDD;
+       rxe->attr.max_mw                        = RXE_MAX_MW;
+       rxe->attr.max_raw_ipv6_qp               = RXE_MAX_RAW_IPV6_QP;
+       rxe->attr.max_raw_ethy_qp               = RXE_MAX_RAW_ETHY_QP;
+       rxe->attr.max_mcast_grp                 = RXE_MAX_MCAST_GRP;
+       rxe->attr.max_mcast_qp_attach           = RXE_MAX_MCAST_QP_ATTACH;
+       rxe->attr.max_total_mcast_qp_attach     = RXE_MAX_TOT_MCAST_QP_ATTACH;
+       rxe->attr.max_ah                        = RXE_MAX_AH;
+       rxe->attr.max_fmr                       = RXE_MAX_FMR;
+       rxe->attr.max_map_per_fmr               = RXE_MAX_MAP_PER_FMR;
+       rxe->attr.max_srq                       = RXE_MAX_SRQ;
+       rxe->attr.max_srq_wr                    = RXE_MAX_SRQ_WR;
+       rxe->attr.max_srq_sge                   = RXE_MAX_SRQ_SGE;
+       rxe->attr.max_fast_reg_page_list_len    = RXE_MAX_FMR_PAGE_LIST_LEN;
+       rxe->attr.max_pkeys                     = RXE_MAX_PKEYS;
+       rxe->attr.local_ca_ack_delay            = RXE_LOCAL_CA_ACK_DELAY;
+
+       rxe->max_ucontext                       = RXE_MAX_UCONTEXT;
+
+       return 0;
+}
+
+/* initialize port attributes */
+static int rxe_init_port_param(struct rxe_port *port)
+{
+       port->attr.state                = RXE_PORT_STATE;
+       port->attr.max_mtu              = RXE_PORT_MAX_MTU;
+       port->attr.active_mtu           = RXE_PORT_ACTIVE_MTU;
+       port->attr.gid_tbl_len          = RXE_PORT_GID_TBL_LEN;
+       port->attr.port_cap_flags       = RXE_PORT_PORT_CAP_FLAGS;
+       port->attr.max_msg_sz           = RXE_PORT_MAX_MSG_SZ;
+       port->attr.bad_pkey_cntr        = RXE_PORT_BAD_PKEY_CNTR;
+       port->attr.qkey_viol_cntr       = RXE_PORT_QKEY_VIOL_CNTR;
+       port->attr.pkey_tbl_len         = RXE_PORT_PKEY_TBL_LEN;
+       port->attr.lid                  = RXE_PORT_LID;
+       port->attr.sm_lid               = RXE_PORT_SM_LID;
+       port->attr.lmc                  = RXE_PORT_LMC;
+       port->attr.max_vl_num           = RXE_PORT_MAX_VL_NUM;
+       port->attr.sm_sl                = RXE_PORT_SM_SL;
+       port->attr.subnet_timeout       = RXE_PORT_SUBNET_TIMEOUT;
+       port->attr.init_type_reply      = RXE_PORT_INIT_TYPE_REPLY;
+       port->attr.active_width         = RXE_PORT_ACTIVE_WIDTH;
+       port->attr.active_speed         = RXE_PORT_ACTIVE_SPEED;
+       port->attr.phys_state           = RXE_PORT_PHYS_STATE;
+       port->mtu_cap                   =
+                               ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU);
+       port->subnet_prefix             = cpu_to_be64(RXE_PORT_SUBNET_PREFIX);
+
+       return 0;
+}
+
+/* initialize port state; note the IB convention that HCA ports are always
+ * numbered from 1
+ */
+static int rxe_init_ports(struct rxe_dev *rxe)
+{
+       struct rxe_port *port = &rxe->port;
+
+       rxe_init_port_param(port);
+
+       if (!port->attr.pkey_tbl_len || !port->attr.gid_tbl_len)
+               return -EINVAL;
+
+       port->pkey_tbl = kcalloc(port->attr.pkey_tbl_len,
+                       sizeof(*port->pkey_tbl), GFP_KERNEL);
+
+       if (!port->pkey_tbl)
+               return -ENOMEM;
+
+       port->pkey_tbl[0] = 0xffff;
+       port->port_guid = rxe->ifc_ops->port_guid(rxe);
+
+       spin_lock_init(&port->port_lock);
+
+       return 0;
+}
+
+/* init pools of managed objects */
+static int rxe_init_pools(struct rxe_dev *rxe)
+{
+       int err;
+
+       err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC,
+                           rxe->max_ucontext);
+       if (err)
+               goto err1;
+
+       err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD,
+                           rxe->attr.max_pd);
+       if (err)
+               goto err2;
+
+       err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH,
+                           rxe->attr.max_ah);
+       if (err)
+               goto err3;
+
+       err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ,
+                           rxe->attr.max_srq);
+       if (err)
+               goto err4;
+
+       err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP,
+                           rxe->attr.max_qp);
+       if (err)
+               goto err5;
+
+       err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ,
+                           rxe->attr.max_cq);
+       if (err)
+               goto err6;
+
+       err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR,
+                           rxe->attr.max_mr);
+       if (err)
+               goto err7;
+
+       err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW,
+                           rxe->attr.max_mw);
+       if (err)
+               goto err8;
+
+       err = rxe_pool_init(rxe, &rxe->mc_grp_pool, RXE_TYPE_MC_GRP,
+                           rxe->attr.max_mcast_grp);
+       if (err)
+               goto err9;
+
+       err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM,
+                           rxe->attr.max_total_mcast_qp_attach);
+       if (err)
+               goto err10;
+
+       return 0;
+
+err10:
+       rxe_pool_cleanup(&rxe->mc_grp_pool);
+err9:
+       rxe_pool_cleanup(&rxe->mw_pool);
+err8:
+       rxe_pool_cleanup(&rxe->mr_pool);
+err7:
+       rxe_pool_cleanup(&rxe->cq_pool);
+err6:
+       rxe_pool_cleanup(&rxe->qp_pool);
+err5:
+       rxe_pool_cleanup(&rxe->srq_pool);
+err4:
+       rxe_pool_cleanup(&rxe->ah_pool);
+err3:
+       rxe_pool_cleanup(&rxe->pd_pool);
+err2:
+       rxe_pool_cleanup(&rxe->uc_pool);
+err1:
+       return err;
+}
+
+/* initialize rxe device state */
+static int rxe_init(struct rxe_dev *rxe)
+{
+       int err;
+
+       /* init default device parameters */
+       rxe_init_device_param(rxe);
+
+       err = rxe_init_ports(rxe);
+       if (err)
+               goto err1;
+
+       err = rxe_init_pools(rxe);
+       if (err)
+               goto err2;
+
+       /* init pending mmap list */
+       spin_lock_init(&rxe->mmap_offset_lock);
+       spin_lock_init(&rxe->pending_lock);
+       INIT_LIST_HEAD(&rxe->pending_mmaps);
+       INIT_LIST_HEAD(&rxe->list);
+
+       mutex_init(&rxe->usdev_lock);
+
+       return 0;
+
+err2:
+       rxe_cleanup_ports(rxe);
+err1:
+       return err;
+}
+
+int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
+{
+       struct rxe_port *port = &rxe->port;
+       enum ib_mtu mtu;
+
+       mtu = eth_mtu_int_to_enum(ndev_mtu);
+
+       /* Make sure that the new MTU is in range */
+       mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256;
+
+       port->attr.active_mtu = mtu;
+       port->mtu_cap = ib_mtu_enum_to_int(mtu);
+
+       return 0;
+}
+EXPORT_SYMBOL(rxe_set_mtu);
+
+/* called by ifc layer to create new rxe device.
+ * The caller should allocate memory for rxe by calling ib_alloc_device.
+ */
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
+{
+       int err;
+
+       kref_init(&rxe->ref_cnt);
+
+       err = rxe_init(rxe);
+       if (err)
+               goto err1;
+
+       err = rxe_set_mtu(rxe, mtu);
+       if (err)
+               goto err1;
+
+       err = rxe_register_device(rxe);
+       if (err)
+               goto err1;
+
+       return 0;
+
+err1:
+       rxe_dev_put(rxe);
+       return err;
+}
+EXPORT_SYMBOL(rxe_add);
+
+/* called by the ifc layer to remove a device */
+void rxe_remove(struct rxe_dev *rxe)
+{
+       rxe_unregister_device(rxe);
+
+       rxe_dev_put(rxe);
+}
+EXPORT_SYMBOL(rxe_remove);
+
+static int __init rxe_module_init(void)
+{
+       int err;
+
+       /* initialize slab caches for managed objects */
+       err = rxe_cache_init();
+       if (err) {
+               pr_err("rxe: unable to init object pools\n");
+               return err;
+       }
+
+       err = rxe_net_init();
+       if (err) {
+               pr_err("rxe: unable to init\n");
+               rxe_cache_exit();
+               return err;
+       }
+       pr_info("rxe: loaded\n");
+
+       return 0;
+}
+
+static void __exit rxe_module_exit(void)
+{
+       rxe_remove_all();
+       rxe_net_exit();
+       rxe_cache_exit();
+
+       pr_info("rxe: unloaded\n");
+}
+
+module_init(rxe_module_init);
+module_exit(rxe_module_exit);
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
new file mode 100644 (file)
index 0000000..12c71c5
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_H
+#define RXE_H
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/crc32.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe_net.h"
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+#include "rxe_param.h"
+#include "rxe_verbs.h"
+
+/* NOTE(review): presumably the user-verbs ABI version advertised by the
+ * driver -- confirm against the device registration code.
+ */
+#define RXE_UVERBS_ABI_VERSION         (1)
+
+/* numeric codes for the port physical state (link up / link down) */
+#define IB_PHYS_STATE_LINK_UP          (5)
+#define IB_PHYS_STATE_LINK_DOWN                (3)
+
+/* NOTE(review): looks like the UDP source port used for RoCEv2 packets --
+ * confirm against the net layer.
+ */
+#define RXE_ROCE_V2_SPORT              (0xc000)
+
+/* derive the port's active MTU and mtu_cap from a netdev MTU; returns 0 */
+int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
+
+/* device life cycle: rxe_add() initialises and registers a device that the
+ * caller allocated, rxe_remove() unregisters it and drops a reference,
+ * rxe_remove_all() is invoked on module unload
+ */
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
+void rxe_remove(struct rxe_dev *rxe);
+void rxe_remove_all(void);
+
+/* NOTE(review): presumably the receive entry point for incoming packets --
+ * confirm against the receive path.
+ */
+int rxe_rcv(struct sk_buff *skb);
+
+/* reference drop and device lookup helpers */
+void rxe_dev_put(struct rxe_dev *rxe);
+struct rxe_dev *net_to_rxe(struct net_device *ndev);
+struct rxe_dev *get_rxe_by_name(const char* name);
+
+/* port state change notifications */
+void rxe_port_up(struct rxe_dev *rxe);
+void rxe_port_down(struct rxe_dev *rxe);
+
+#endif /* RXE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
new file mode 100644 (file)
index 0000000..5c94742
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Validate address handle attributes supplied by a caller.
+ *
+ * Only port number 1 is accepted. When IB_AH_GRH is set in attr->ah_flags
+ * the source GID index must address an entry of the port's GID table,
+ * i.e. be smaller than port->attr.gid_tbl_len; an index equal to the
+ * table length is one past the last entry and is rejected as well.
+ *
+ * Returns 0 if the attributes are acceptable and -EINVAL otherwise.
+ */
+int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr)
+{
+       struct rxe_port *port;
+
+       if (attr->port_num != 1) {
+               pr_info("rxe: invalid port_num = %d\n", attr->port_num);
+               return -EINVAL;
+       }
+
+       port = &rxe->port;
+
+       if (attr->ah_flags & IB_AH_GRH) {
+               /* valid indices are 0 .. gid_tbl_len - 1 */
+               if (attr->grh.sgid_index >= port->attr.gid_tbl_len) {
+                       pr_info("rxe: invalid sgid index = %d\n",
+                               attr->grh.sgid_index);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/* Initialise an address vector from address handle attributes.
+ *
+ * *av is zeroed, attr->grh is copied into av->grh and port_num is recorded
+ * in av->port_num. The rxe argument is not used. Always returns 0.
+ */
+int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num,
+                    struct rxe_av *av, struct ib_ah_attr *attr)
+{
+       memset(av, 0, sizeof(*av));
+       memcpy(&av->grh, &attr->grh, sizeof(attr->grh));
+       av->port_num = port_num;
+       return 0;
+}
+
+/* Export an address vector into address handle attributes.
+ *
+ * Copies av->grh into attr->grh and av->port_num into attr->port_num; the
+ * other fields of *attr are left untouched. The rxe argument is not used.
+ * Always returns 0.
+ */
+int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av,
+                  struct ib_ah_attr *attr)
+{
+       memcpy(&attr->grh, &av->grh, sizeof(av->grh));
+       attr->port_num = av->port_num;
+       return 0;
+}
+
+/* Fill in the IP level addressing information of an address vector.
+ *
+ * The source GID (sgid) and the destination GID (attr->grh.dgid) are
+ * converted with rdma_gid2ip() into av->sgid_addr and av->dgid_addr, and
+ * av->network_type is set from sgid_attr->gid_type and sgid via
+ * ib_gid_to_network_type(). The rxe argument is not used.
+ * Always returns 0.
+ */
+int rxe_av_fill_ip_info(struct rxe_dev *rxe,
+                       struct rxe_av *av,
+                       struct ib_ah_attr *attr,
+                       struct ib_gid_attr *sgid_attr,
+                       union ib_gid *sgid)
+{
+       rdma_gid2ip(&av->sgid_addr._sockaddr, sgid);
+       rdma_gid2ip(&av->dgid_addr._sockaddr, &attr->grh.dgid);
+       av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid);
+
+       return 0;
+}
+
+/* Look up the address vector that applies to a packet.
+ *
+ * RC and UC QPs use the primary address vector stored in the QP itself;
+ * every other QP type uses the address vector of the packet's WQE.
+ *
+ * Returns NULL when pkt is NULL, when the packet has no QP, or when a
+ * packet on a non RC/UC QP has no WQE attached.
+ */
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt)
+{
+       struct rxe_qp *qp;
+
+       if (!pkt)
+               return NULL;
+
+       qp = pkt->qp;
+       if (!qp)
+               return NULL;
+
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               return &qp->pri_av;
+       default:
+               break;
+       }
+
+       if (!pkt->wqe)
+               return NULL;
+
+       return &pkt->wqe->av;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
new file mode 100644 (file)
index 0000000..36f67de
--- /dev/null
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+/* states of the completer state machine run by rxe_completer() */
+enum comp_state {
+       COMPST_GET_ACK,
+       COMPST_GET_WQE,
+       COMPST_COMP_WQE,
+       COMPST_COMP_ACK,
+       COMPST_CHECK_PSN,
+       COMPST_CHECK_ACK,
+       COMPST_READ,
+       COMPST_ATOMIC,
+       COMPST_WRITE_SEND,
+       COMPST_UPDATE_COMP,
+       COMPST_ERROR_RETRY,
+       COMPST_RNR_RETRY,
+       COMPST_ERROR,
+       COMPST_EXIT, /* We have an issue, and we want to rerun the completer */
+       COMPST_DONE, /* The completer finished successfully */
+};
+
+/* printable names of the completer states, indexed by enum comp_state;
+ * only read for debug output, hence fully const
+ */
+static const char * const comp_state_name[] =  {
+       [COMPST_GET_ACK]                = "GET ACK",
+       [COMPST_GET_WQE]                = "GET WQE",
+       [COMPST_COMP_WQE]               = "COMP WQE",
+       [COMPST_COMP_ACK]               = "COMP ACK",
+       [COMPST_CHECK_PSN]              = "CHECK PSN",
+       [COMPST_CHECK_ACK]              = "CHECK ACK",
+       [COMPST_READ]                   = "READ",
+       [COMPST_ATOMIC]                 = "ATOMIC",
+       [COMPST_WRITE_SEND]             = "WRITE/SEND",
+       [COMPST_UPDATE_COMP]            = "UPDATE COMP",
+       [COMPST_ERROR_RETRY]            = "ERROR RETRY",
+       [COMPST_RNR_RETRY]              = "RNR RETRY",
+       [COMPST_ERROR]                  = "ERROR",
+       [COMPST_EXIT]                   = "EXIT",
+       [COMPST_DONE]                   = "DONE",
+};
+
+/* RNR NAK timer values in microseconds, indexed by the IB_RNR_TIMER_*
+ * encoding. The constant names carry the delay in milliseconds with two
+ * decimals (IB_RNR_TIMER_061_44 is 61.44 ms, i.e. 61440 usec).
+ * The table is only ever read, so it is const.
+ */
+static const unsigned long rnrnak_usec[32] = {
+       [IB_RNR_TIMER_655_36] = 655360,
+       [IB_RNR_TIMER_000_01] = 10,
+       [IB_RNR_TIMER_000_02] = 20,
+       [IB_RNR_TIMER_000_03] = 30,
+       [IB_RNR_TIMER_000_04] = 40,
+       [IB_RNR_TIMER_000_06] = 60,
+       [IB_RNR_TIMER_000_08] = 80,
+       [IB_RNR_TIMER_000_12] = 120,
+       [IB_RNR_TIMER_000_16] = 160,
+       [IB_RNR_TIMER_000_24] = 240,
+       [IB_RNR_TIMER_000_32] = 320,
+       [IB_RNR_TIMER_000_48] = 480,
+       [IB_RNR_TIMER_000_64] = 640,
+       [IB_RNR_TIMER_000_96] = 960,
+       [IB_RNR_TIMER_001_28] = 1280,
+       [IB_RNR_TIMER_001_92] = 1920,
+       [IB_RNR_TIMER_002_56] = 2560,
+       [IB_RNR_TIMER_003_84] = 3840,
+       [IB_RNR_TIMER_005_12] = 5120,
+       [IB_RNR_TIMER_007_68] = 7680,
+       [IB_RNR_TIMER_010_24] = 10240,
+       [IB_RNR_TIMER_015_36] = 15360,
+       [IB_RNR_TIMER_020_48] = 20480,
+       [IB_RNR_TIMER_030_72] = 30720,
+       [IB_RNR_TIMER_040_96] = 40960,
+       [IB_RNR_TIMER_061_44] = 61440,
+       [IB_RNR_TIMER_081_92] = 81920,
+       [IB_RNR_TIMER_122_88] = 122880,
+       [IB_RNR_TIMER_163_84] = 163840,
+       [IB_RNR_TIMER_245_76] = 245760,
+       [IB_RNR_TIMER_327_68] = 327680,
+       [IB_RNR_TIMER_491_52] = 491520,
+};
+
+/* Convert an RNR NAK timer code into a delay in jiffies.
+ *
+ * timeout indexes rnrnak_usec[]; the result is never smaller than one
+ * jiffy so that a timer armed with it always lies in the future.
+ */
+static inline unsigned long rnrnak_jiffies(u8 timeout)
+{
+       unsigned long ticks = usecs_to_jiffies(rnrnak_usec[timeout]);
+
+       return ticks ? ticks : 1;
+}
+
+/* Map a work request opcode to the opcode reported in its work completion.
+ *
+ * The immediate and invalidate variants of an operation report the opcode
+ * of the plain operation. Opcodes without a mapping yield 0xff.
+ */
+static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
+{
+       switch (opcode) {
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               return IB_WC_RDMA_WRITE;
+
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+       case IB_WR_SEND_WITH_INV:
+               return IB_WC_SEND;
+
+       case IB_WR_RDMA_READ:
+       case IB_WR_RDMA_READ_WITH_INV:
+               return IB_WC_RDMA_READ;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+               return IB_WC_COMP_SWAP;
+
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               return IB_WC_FETCH_ADD;
+
+       case IB_WR_LSO:
+               return IB_WC_LSO;
+
+       case IB_WR_LOCAL_INV:
+               return IB_WC_LOCAL_INV;
+
+       case IB_WR_REG_MR:
+               return IB_WC_REG_MR;
+
+       default:
+               return 0xff;
+       }
+}
+
+/* Retransmit timer callback; data carries the QP pointer.
+ *
+ * For a QP that is still valid the timeout is flagged in
+ * qp->comp.timeout and the completer task is run so it can act on it.
+ * Nothing is done for a QP that is no longer valid.
+ */
+void retransmit_timer(unsigned long data)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)data;
+
+       if (!qp->valid)
+               return;
+
+       qp->comp.timeout = 1;
+       rxe_run_task(&qp->comp.task, 1);
+}
+
+/* Queue a response packet for the completer and run the completer task.
+ *
+ * The skb is appended to qp->resp_pkts. The second argument handed to
+ * rxe_run_task() is non-zero when more than one packet is queued after the
+ * append. The rxe argument is not used.
+ */
+void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
+                       struct sk_buff *skb)
+{
+       skb_queue_tail(&qp->resp_pkts, skb);
+
+       /* NOTE(review): presumably a non-zero flag defers the task to the
+        * scheduler instead of running it inline -- confirm in rxe_task.c
+        */
+       rxe_run_task(&qp->comp.task, skb_queue_len(&qp->resp_pkts) > 1);
+}
+
+/* Fetch the WQE at the head of the send queue and choose the next state.
+ *
+ * This runs whether or not a response packet was found, to see if there
+ * are any posted WQEs. The head WQE (possibly NULL) is stored in *wqe_p.
+ *
+ * Next state:
+ *  - no WQE, or the requester has not started it yet: COMPST_DONE when a
+ *    packet is held, COMPST_EXIT otherwise
+ *  - WQE that does not require an ack: COMPST_COMP_WQE
+ *  - WQE that caused an error: COMPST_ERROR
+ *  - anything else: COMPST_CHECK_PSN when a packet is held so that its PSN
+ *    gets checked, COMPST_EXIT otherwise
+ */
+static inline enum comp_state get_wqe(struct rxe_qp *qp,
+                                     struct rxe_pkt_info *pkt,
+                                     struct rxe_send_wqe **wqe_p)
+{
+       struct rxe_send_wqe *head = queue_head(qp->sq.queue);
+
+       *wqe_p = head;
+
+       if (!head)
+               return pkt ? COMPST_DONE : COMPST_EXIT;
+
+       switch (head->state) {
+       case wqe_state_posted:
+               return pkt ? COMPST_DONE : COMPST_EXIT;
+
+       case wqe_state_done:
+               return COMPST_COMP_WQE;
+
+       case wqe_state_error:
+               return COMPST_ERROR;
+
+       default:
+               return pkt ? COMPST_CHECK_PSN : COMPST_EXIT;
+       }
+}
+
+/* reload the completer's retry and RNR retry counters from the values
+ * configured in the QP attributes
+ */
+static inline void reset_retry_counters(struct rxe_qp *qp)
+{
+       qp->comp.retry_cnt = qp->attr.retry_cnt;
+       qp->comp.rnr_retry = qp->attr.rnr_retry;
+}
+
+/* Compare the PSN of a response packet with the head WQE and with the PSN
+ * the completer expects next, and choose the next state.
+ *
+ * Response past the last PSN of the WQE:
+ *  - WQE not pending: COMPST_DONE
+ *  - pending read/atomic: COMPST_ERROR_RETRY
+ *  - pending send/write: retry counters are reset, COMPST_COMP_WQE
+ *
+ * Otherwise, relative to qp->comp.psn:
+ *  - older: COMPST_COMP_ACK if it matches the last PSN of the WQE (most
+ *    likely a retried packet for an uncompleted WQE), else COMPST_DONE
+ *  - newer while the WQE is a read/atomic: COMPST_ERROR_RETRY
+ *  - everything else: COMPST_CHECK_ACK
+ */
+static inline enum comp_state check_psn(struct rxe_qp *qp,
+                                       struct rxe_pkt_info *pkt,
+                                       struct rxe_send_wqe *wqe)
+{
+       s32 delta;
+
+       /* is the response past the oldest WQE? */
+       if (psn_compare(pkt->psn, wqe->last_psn) > 0) {
+               if (wqe->state != wqe_state_pending)
+                       return COMPST_DONE;
+
+               if (wqe->mask & WR_ATOMIC_OR_READ_MASK)
+                       return COMPST_ERROR_RETRY;
+
+               reset_retry_counters(qp);
+               return COMPST_COMP_WQE;
+       }
+
+       /* compare response packet to expected response */
+       delta = psn_compare(pkt->psn, qp->comp.psn);
+
+       if (delta < 0)
+               return (pkt->psn == wqe->last_psn) ?
+                       COMPST_COMP_ACK : COMPST_DONE;
+
+       if (delta > 0 && (wqe->mask & WR_ATOMIC_OR_READ_MASK))
+               return COMPST_ERROR_RETRY;
+
+       return COMPST_CHECK_ACK;
+}
+
+/* Validate a response packet and choose the state that processes it.
+ *
+ * Two checks are made. First the packet opcode has to fit the sequencing
+ * state kept in qp->comp.opcode (-1 while no multi packet response is in
+ * progress). Then the packet opcode has to fit the kind of work request
+ * at the head of the send queue, and its AETH syndrome is decoded where
+ * the packet carries one.
+ *
+ * Returns COMPST_READ, COMPST_ATOMIC or COMPST_WRITE_SEND for a good
+ * response (the retry counters are reset in these cases),
+ * COMPST_RNR_RETRY for an RNR NAK, COMPST_ERROR_RETRY for a PSN sequence
+ * error NAK and COMPST_ERROR for everything else. For the other NAK codes
+ * wqe->status is set before COMPST_ERROR is returned.
+ */
+static inline enum comp_state check_ack(struct rxe_qp *qp,
+                                       struct rxe_pkt_info *pkt,
+                                       struct rxe_send_wqe *wqe)
+{
+       unsigned int mask = pkt->mask;
+       u8 syn;
+
+       /* Check the sequence only */
+       switch (qp->comp.opcode) {
+       case -1:
+               /* Will catch all *_ONLY cases. */
+               if (!(mask & RXE_START_MASK))
+                       return COMPST_ERROR;
+
+               break;
+
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+               /* in the middle of a read response only a MIDDLE or LAST
+                * packet may follow
+                */
+               if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE &&
+                   pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) {
+                       return COMPST_ERROR;
+               }
+               break;
+       default:
+               /* unexpected sequencing state: warn, then carry on with the
+                * operation check below
+                */
+               WARN_ON(1);
+       }
+
+       /* Check operation validity. */
+       switch (pkt->opcode) {
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
+               syn = aeth_syn(pkt);
+
+               if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+                       return COMPST_ERROR;
+
+               /* Fall through (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE
+                * doesn't have an AETH)
+                */
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+               /* a read response is only valid for a read work request */
+               if (wqe->wr.opcode != IB_WR_RDMA_READ &&
+                   wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) {
+                       return COMPST_ERROR;
+               }
+               reset_retry_counters(qp);
+               return COMPST_READ;
+
+       case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE:
+               syn = aeth_syn(pkt);
+
+               if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+                       return COMPST_ERROR;
+
+               /* an atomic ack is only valid for an atomic work request */
+               if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP &&
+                   wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD)
+                       return COMPST_ERROR;
+               reset_retry_counters(qp);
+               return COMPST_ATOMIC;
+
+       case IB_OPCODE_RC_ACKNOWLEDGE:
+               syn = aeth_syn(pkt);
+               switch (syn & AETH_TYPE_MASK) {
+               case AETH_ACK:
+                       reset_retry_counters(qp);
+                       return COMPST_WRITE_SEND;
+
+               case AETH_RNR_NAK:
+                       return COMPST_RNR_RETRY;
+
+               case AETH_NAK:
+                       /* the full syndrome selects the NAK code */
+                       switch (syn) {
+                       case AETH_NAK_PSN_SEQ_ERROR:
+                               /* a nak implicitly acks all packets with psns
+                                * before
+                                */
+                               if (psn_compare(pkt->psn, qp->comp.psn) > 0) {
+                                       qp->comp.psn = pkt->psn;
+                                       if (qp->req.wait_psn) {
+                                               qp->req.wait_psn = 0;
+                                               rxe_run_task(&qp->req.task, 1);
+                                       }
+                               }
+                               return COMPST_ERROR_RETRY;
+
+                       case AETH_NAK_INVALID_REQ:
+                               wqe->status = IB_WC_REM_INV_REQ_ERR;
+                               return COMPST_ERROR;
+
+                       case AETH_NAK_REM_ACC_ERR:
+                               wqe->status = IB_WC_REM_ACCESS_ERR;
+                               return COMPST_ERROR;
+
+                       case AETH_NAK_REM_OP_ERR:
+                               wqe->status = IB_WC_REM_OP_ERR;
+                               return COMPST_ERROR;
+
+                       default:
+                               pr_warn("unexpected nak %x\n", syn);
+                               wqe->status = IB_WC_REM_OP_ERR;
+                               return COMPST_ERROR;
+                       }
+
+               default:
+                       return COMPST_ERROR;
+               }
+               break;
+
+       default:
+               pr_warn("unexpected opcode\n");
+       }
+
+       return COMPST_ERROR;
+}
+
+/* Copy the payload of a read response packet into the memory described by
+ * the DMA state of the WQE.
+ *
+ * Returns COMPST_ERROR if the copy fails, COMPST_COMP_ACK once the WQE has
+ * no residual left and the packet carries RXE_END_MASK, and
+ * COMPST_UPDATE_COMP in all other cases.
+ */
+static inline enum comp_state do_read(struct rxe_qp *qp,
+                                     struct rxe_pkt_info *pkt,
+                                     struct rxe_send_wqe *wqe)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+       if (copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &wqe->dma,
+                     payload_addr(pkt), payload_size(pkt),
+                     to_mem_obj, NULL))
+               return COMPST_ERROR;
+
+       if ((pkt->mask & RXE_END_MASK) && wqe->dma.resid == 0)
+               return COMPST_COMP_ACK;
+
+       return COMPST_UPDATE_COMP;
+}
+
+/* Copy the original value carried by an atomic acknowledge packet into
+ * the memory described by the DMA state of the WQE.
+ *
+ * Returns COMPST_COMP_ACK on success and COMPST_ERROR if the copy fails.
+ */
+static inline enum comp_state do_atomic(struct rxe_qp *qp,
+                                       struct rxe_pkt_info *pkt,
+                                       struct rxe_send_wqe *wqe)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       u64 orig_val = atmack_orig(pkt);
+       int err;
+
+       err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &wqe->dma,
+                       &orig_val, sizeof(u64), to_mem_obj, NULL);
+
+       return err ? COMPST_ERROR : COMPST_COMP_ACK;
+}
+
+/* Build the completion queue entry for a send queue WQE.
+ *
+ * *cqe is zeroed and then filled in one of two layouts: the user space
+ * layout (cqe->uibwc) when the QP belongs to user space, the kernel layout
+ * (cqe->ibwc) otherwise. Both carry the work request id, status, mapped
+ * opcode and byte length; IB_WC_WITH_IMM is set for the operations that
+ * carry immediate data. The user layout identifies the QP by number, the
+ * kernel layout by pointer.
+ */
+static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                         struct rxe_cqe *cqe)
+{
+       const int with_imm =
+               wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+               wqe->wr.opcode == IB_WR_SEND_WITH_IMM;
+
+       memset(cqe, 0, sizeof(*cqe));
+
+       if (qp->is_user) {
+               struct ib_uverbs_wc *uwc = &cqe->uibwc;
+
+               uwc->wr_id = wqe->wr.wr_id;
+               uwc->status = wqe->status;
+               uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+               if (with_imm)
+                       uwc->wc_flags = IB_WC_WITH_IMM;
+               uwc->byte_len = wqe->dma.length;
+               uwc->qp_num = qp->ibqp.qp_num;
+               return;
+       }
+
+       {
+               struct ib_wc *wc = &cqe->ibwc;
+
+               wc->wr_id = wqe->wr.wr_id;
+               wc->status = wqe->status;
+               wc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+               if (with_imm)
+                       wc->wc_flags = IB_WC_WITH_IMM;
+               wc->byte_len = wqe->dma.length;
+               wc->qp = &qp->ibqp;
+       }
+}
+
+/* Retire the WQE at the head of the send queue.
+ *
+ * A completion is posted to the send CQ when the QP signals all work
+ * requests, when the work request asked for a signalled completion, or
+ * when the requester is in the error state. The consumer index of the
+ * send queue is advanced in every case.
+ */
+static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+       if (qp->sq_sig_type == IB_SIGNAL_ALL_WR ||
+           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+           qp->req.state == QP_STATE_ERROR) {
+               struct rxe_cqe cqe;
+
+               make_send_cqe(qp, wqe, &cqe);
+               rxe_cq_post(qp->scq, &cqe, 0);
+       }
+
+       advance_consumer(qp->sq.queue);
+
+       /* something completed, so a requester that is waiting on a fence
+        * gets to run again
+        */
+       if (!qp->req.wait_fence)
+               return;
+
+       qp->req.wait_fence = 0;
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* Complete the WQE that the response packet acknowledges.
+ *
+ * In order: a read/atomic credit held by the WQE is handed back (and the
+ * requester is run again if it was waiting for one), a draining send
+ * queue that has caught up with the requester is marked drained and
+ * IB_EVENT_SQ_DRAINED is delivered, and finally the WQE is retired with
+ * do_complete().
+ *
+ * Returns COMPST_UPDATE_COMP if the PSN of the packet is not older than
+ * the PSN the completer expects, COMPST_DONE otherwise.
+ */
+static inline enum comp_state complete_ack(struct rxe_qp *qp,
+                                          struct rxe_pkt_info *pkt,
+                                          struct rxe_send_wqe *wqe)
+{
+       unsigned long flags;
+
+       if (wqe->has_rd_atomic) {
+               wqe->has_rd_atomic = 0;
+               atomic_inc(&qp->req.rd_atomic);
+               if (qp->req.need_rd_atomic) {
+                       qp->comp.timeout_retry = 0;
+                       qp->req.need_rd_atomic = 0;
+                       rxe_run_task(&qp->req.task, 1);
+               }
+       }
+
+       /* unlocked peek first; the state is checked again under the lock */
+       if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+               /* state_lock used by requester & completer */
+               spin_lock_irqsave(&qp->state_lock, flags);
+               if ((qp->req.state == QP_STATE_DRAIN) &&
+                   (qp->comp.psn == qp->req.psn)) {
+                       qp->req.state = QP_STATE_DRAINED;
+                       spin_unlock_irqrestore(&qp->state_lock, flags);
+
+                       /* the handler is called after the lock is dropped */
+                       if (qp->ibqp.event_handler) {
+                               struct ib_event ev;
+
+                               ev.device = qp->ibqp.device;
+                               ev.element.qp = &qp->ibqp;
+                               ev.event = IB_EVENT_SQ_DRAINED;
+                               qp->ibqp.event_handler(&ev,
+                                       qp->ibqp.qp_context);
+                       }
+               } else {
+                       spin_unlock_irqrestore(&qp->state_lock, flags);
+               }
+       }
+
+       do_complete(qp, wqe);
+
+       if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+               return COMPST_UPDATE_COMP;
+       else
+               return COMPST_DONE;
+}
+
+/* Complete the WQE at the head of the send queue without processing an
+ * acknowledge for it.
+ *
+ * The sequencing state qp->comp.opcode is reset to -1. pkt may be NULL;
+ * when a packet is held, the expected PSN is moved past it if the packet
+ * is not older than that PSN, and a requester waiting on the PSN is run
+ * again. The WQE is then retired with do_complete().
+ *
+ * Always returns COMPST_GET_WQE so that the next WQE gets examined.
+ */
+static inline enum comp_state complete_wqe(struct rxe_qp *qp,
+                                          struct rxe_pkt_info *pkt,
+                                          struct rxe_send_wqe *wqe)
+{
+       qp->comp.opcode = -1;
+
+       if (pkt) {
+               if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+                       qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+               if (qp->req.wait_psn) {
+                       qp->req.wait_psn = 0;
+                       rxe_run_task(&qp->req.task, 1);
+               }
+       }
+
+       do_complete(qp, wqe);
+
+       return COMPST_GET_WQE;
+}
+
+/* Completer task of a QP; arg is the struct rxe_qp.
+ *
+ * A QP that is not valid, or whose requester is in the error or reset
+ * state, only gets its queues drained: queued response packets are freed
+ * (dropping one QP reference per packet) and the send queue is emptied.
+ * In the error state every WQE is completed with IB_WC_WR_FLUSH_ERR, in
+ * the other two cases the WQEs are discarded silently.
+ *
+ * Otherwise at most one response packet is taken from qp->resp_pkts and
+ * run through the state machine described by enum comp_state together
+ * with the WQE at the head of the send queue.
+ *
+ * Returns 0 if a packet was processed and the task should call again to
+ * see if there is more to do, -EAGAIN if the task should leave its loop.
+ */
+int rxe_completer(void *arg)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)arg;
+       struct rxe_send_wqe *wqe = NULL;
+       struct sk_buff *skb = NULL;
+       struct rxe_pkt_info *pkt = NULL;
+       enum comp_state state;
+
+       if (!qp->valid) {
+               while ((skb = skb_dequeue(&qp->resp_pkts))) {
+                       rxe_drop_ref(qp);
+                       kfree_skb(skb);
+               }
+               skb = NULL;
+               pkt = NULL;
+
+               while (queue_head(qp->sq.queue))
+                       advance_consumer(qp->sq.queue);
+
+               goto exit;
+       }
+
+       if (qp->req.state == QP_STATE_ERROR) {
+               while ((skb = skb_dequeue(&qp->resp_pkts))) {
+                       rxe_drop_ref(qp);
+                       kfree_skb(skb);
+               }
+               skb = NULL;
+               pkt = NULL;
+
+               /* flush the send queue, reporting every WQE as flushed */
+               while ((wqe = queue_head(qp->sq.queue))) {
+                       wqe->status = IB_WC_WR_FLUSH_ERR;
+                       do_complete(qp, wqe);
+               }
+
+               goto exit;
+       }
+
+       if (qp->req.state == QP_STATE_RESET) {
+               while ((skb = skb_dequeue(&qp->resp_pkts))) {
+                       rxe_drop_ref(qp);
+                       kfree_skb(skb);
+               }
+               skb = NULL;
+               pkt = NULL;
+
+               while (queue_head(qp->sq.queue))
+                       advance_consumer(qp->sq.queue);
+
+               goto exit;
+       }
+
+       /* latch a timeout flagged by the retransmit timer for this run */
+       if (qp->comp.timeout) {
+               qp->comp.timeout_retry = 1;
+               qp->comp.timeout = 0;
+       } else {
+               qp->comp.timeout_retry = 0;
+       }
+
+       /* a retry is already pending with the requester */
+       if (qp->req.need_retry)
+               goto exit;
+
+       state = COMPST_GET_ACK;
+
+       while (1) {
+               pr_debug("state = %s\n", comp_state_name[state]);
+               switch (state) {
+               case COMPST_GET_ACK:
+                       skb = skb_dequeue(&qp->resp_pkts);
+                       if (skb) {
+                               pkt = SKB_TO_PKT(skb);
+                               qp->comp.timeout_retry = 0;
+                       }
+                       state = COMPST_GET_WQE;
+                       break;
+
+               case COMPST_GET_WQE:
+                       state = get_wqe(qp, pkt, &wqe);
+                       break;
+
+               case COMPST_CHECK_PSN:
+                       state = check_psn(qp, pkt, wqe);
+                       break;
+
+               case COMPST_CHECK_ACK:
+                       state = check_ack(qp, pkt, wqe);
+                       break;
+
+               case COMPST_READ:
+                       state = do_read(qp, pkt, wqe);
+                       break;
+
+               case COMPST_ATOMIC:
+                       state = do_atomic(qp, pkt, wqe);
+                       break;
+
+               case COMPST_WRITE_SEND:
+                       if (wqe->state == wqe_state_pending &&
+                           wqe->last_psn == pkt->psn)
+                               state = COMPST_COMP_ACK;
+                       else
+                               state = COMPST_UPDATE_COMP;
+                       break;
+
+               case COMPST_COMP_ACK:
+                       state = complete_ack(qp, pkt, wqe);
+                       break;
+
+               case COMPST_COMP_WQE:
+                       state = complete_wqe(qp, pkt, wqe);
+                       break;
+
+               case COMPST_UPDATE_COMP:
+                       if (pkt->mask & RXE_END_MASK)
+                               qp->comp.opcode = -1;
+                       else
+                               qp->comp.opcode = pkt->opcode;
+
+                       if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+                               qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+                       if (qp->req.wait_psn) {
+                               qp->req.wait_psn = 0;
+                               rxe_run_task(&qp->req.task, 1);
+                       }
+
+                       state = COMPST_DONE;
+                       break;
+
+               case COMPST_DONE:
+                       if (pkt) {
+                               rxe_drop_ref(pkt->qp);
+                               kfree_skb(skb);
+                       }
+                       goto done;
+
+               case COMPST_EXIT:
+                       if (qp->comp.timeout_retry && wqe) {
+                               state = COMPST_ERROR_RETRY;
+                               break;
+                       }
+
+                       /* reset the timeout counter if
+                        * (1) QP is type RC
+                        * (2) the QP is alive
+                        * (3) there is a packet sent by the requester that
+                        *     might be acked (we still might get spurious
+                        *     timeouts but try to keep them as few as possible)
+                        * (4) the timeout parameter is set
+                        */
+                       if ((qp_type(qp) == IB_QPT_RC) &&
+                           (qp->req.state == QP_STATE_READY) &&
+                           (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
+                           qp->qp_timeout_jiffies)
+                               mod_timer(&qp->retrans_timer,
+                                         jiffies + qp->qp_timeout_jiffies);
+                       goto exit;
+
+               case COMPST_ERROR_RETRY:
+                       /* we come here if the retry timer fired and we did
+                        * not receive a response packet. try to retry the send
+                        * queue if that makes sense and the limits have not
+                        * been exceeded. remember that some timeouts are
+                        * spurious since we do not reset the timer but kick
+                        * it down the road or let it expire
+                        */
+
+                       /* there is nothing to retry in this case */
+                       if (!wqe || (wqe->state == wqe_state_posted))
+                               goto exit;
+
+                       if (qp->comp.retry_cnt > 0) {
+                               /* a count of 7 is never decremented */
+                               if (qp->comp.retry_cnt != 7)
+                                       qp->comp.retry_cnt--;
+
+                               /* no point in retrying if we have already
+                                * seen the last ack that the requester could
+                                * have caused
+                                */
+                               if (psn_compare(qp->req.psn,
+                                               qp->comp.psn) > 0) {
+                                       /* tell the requester to retry the
+                                        * send queue next time around
+                                        */
+                                       qp->req.need_retry = 1;
+                                       rxe_run_task(&qp->req.task, 1);
+                               }
+                               goto exit;
+                       } else {
+                               wqe->status = IB_WC_RETRY_EXC_ERR;
+                               state = COMPST_ERROR;
+                       }
+                       break;
+
+               case COMPST_RNR_RETRY:
+                       if (qp->comp.rnr_retry > 0) {
+                               /* a count of 7 is never decremented */
+                               if (qp->comp.rnr_retry != 7)
+                                       qp->comp.rnr_retry--;
+
+                               qp->req.need_retry = 1;
+                               pr_debug("set rnr nak timer\n");
+                               mod_timer(&qp->rnr_nak_timer,
+                                         jiffies + rnrnak_jiffies(aeth_syn(pkt)
+                                               & ~AETH_TYPE_MASK));
+                               goto exit;
+                       } else {
+                               wqe->status = IB_WC_RNR_RETRY_EXC_ERR;
+                               state = COMPST_ERROR;
+                       }
+                       break;
+
+               case COMPST_ERROR:
+                       do_complete(qp, wqe);
+                       rxe_qp_error(qp);
+                       goto exit;
+               }
+       }
+
+exit:
+       /* we come here if we are done with processing and want the task to
+        * exit from the loop calling us
+        *
+        * the retry and error paths arrive here while still holding the
+        * response packet taken in COMPST_GET_ACK; release it and the QP
+        * reference that goes with it, exactly as COMPST_DONE does, so that
+        * neither is leaked (pkt is NULL on all other paths)
+        */
+       if (pkt) {
+               rxe_drop_ref(pkt->qp);
+               kfree_skb(skb);
+       }
+       return -EAGAIN;
+
+done:
+       /* we come here if we have processed a packet we want the task to call
+        * us again to see if there is anything else to do
+        */
+       return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
new file mode 100644 (file)
index 0000000..e5e6a5e
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/*
+ * Validate a requested completion queue depth.
+ *
+ * @cqe must be positive and must not exceed rxe->attr.max_cqe.  @cq may be
+ * NULL; when it is not, @cqe must also be at least the number of elements
+ * currently held in cq->queue.  @comp_vector and @udata are not examined
+ * here.
+ *
+ * Returns 0 when the request is acceptable and -EINVAL otherwise.
+ */
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+                   int cqe, int comp_vector, struct ib_udata *udata)
+{
+       int count;
+
+       if (cqe <= 0) {
+               pr_warn("cqe(%d) <= 0\n", cqe);
+               goto err1;
+       }
+
+       if (cqe > rxe->attr.max_cqe) {
+               pr_warn("cqe(%d) > max_cqe(%d)\n",
+                       cqe, rxe->attr.max_cqe);
+               goto err1;
+       }
+
+       if (cq) {
+               count = queue_count(cq->queue);
+               if (cqe < count) {
+                       /* newline-terminated like the other warnings */
+                       pr_warn("cqe(%d) < current # elements in queue (%d)\n",
+                               cqe, count);
+                       goto err1;
+               }
+       }
+
+       return 0;
+
+err1:
+       return -EINVAL;
+}
+
+/* Tasklet body: @data carries the address of a struct rxe_cq; call that
+ * CQ's completion handler with its own context.
+ */
+static void rxe_send_complete(unsigned long data)
+{
+       struct ib_cq *ibcq = &((struct rxe_cq *)data)->ibcq;
+
+       ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+/*
+ * Set up the queue, mmap info, completion tasklet and lock of a new CQ.
+ *
+ * @cqe is handed to rxe_queue_init() by pointer, so the value finally
+ * stored in cq->ibcq.cqe is whatever that call leaves in it.  cq->is_user
+ * is set when @udata is non-NULL.
+ *
+ * Returns 0 on success, -ENOMEM if the queue cannot be created, or the
+ * error reported by do_mmap_info().
+ */
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+                    int comp_vector, struct ib_ucontext *context,
+                    struct ib_udata *udata)
+{
+       int err;
+
+       cq->queue = rxe_queue_init(rxe, &cqe,
+                                  sizeof(struct rxe_cqe));
+       if (!cq->queue) {
+               pr_warn("unable to create cq\n");
+               return -ENOMEM;
+       }
+
+       err = do_mmap_info(rxe, udata, false, context, cq->queue->buf,
+                          cq->queue->buf_size, &cq->queue->ip);
+       if (err) {
+               kvfree(cq->queue->buf);
+               kfree(cq->queue);
+               /* rxe_cq_cleanup() releases any non-NULL queue; do not
+                * leave a pointer to the memory that was just freed
+                */
+               cq->queue = NULL;
+               return err;
+       }
+
+       if (udata)
+               cq->is_user = 1;
+
+       tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq);
+
+       spin_lock_init(&cq->cq_lock);
+       cq->ibcq.cqe = cqe;
+       return 0;
+}
+
+/*
+ * Resize the queue behind @cq to hold @cqe elements.
+ *
+ * @cqe is passed to rxe_queue_resize() by pointer; on success the value it
+ * leaves there is recorded in cq->ibcq.cqe.  Returns 0 or the error from
+ * rxe_queue_resize().
+ */
+int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, struct ib_udata *udata)
+{
+       int err;
+
+       err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe,
+                              sizeof(struct rxe_cqe),
+                              cq->queue->ip ? cq->queue->ip->context : NULL,
+                              udata, NULL, &cq->cq_lock);
+       if (err)
+               return err;
+
+       cq->ibcq.cqe = cqe;
+       return 0;
+}
+
+/*
+ * Append one completion entry to @cq and, if the CQ is armed, schedule the
+ * completion tasklet.
+ *
+ * @cqe is copied into the queue's producer slot under cq->cq_lock.
+ * @solicited is only consulted when cq->notify is IB_CQ_SOLICITED.
+ *
+ * Returns 0 on success.  If the queue is full nothing is queued, an
+ * IB_EVENT_CQ_ERR event is delivered to the CQ's event handler (when one is
+ * set) and -EBUSY is returned.
+ */
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
+{
+       struct ib_event ev;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->cq_lock, flags);
+
+       if (unlikely(queue_full(cq->queue))) {
+               /* drop the lock before calling out to the event handler */
+               spin_unlock_irqrestore(&cq->cq_lock, flags);
+               if (cq->ibcq.event_handler) {
+                       ev.device = cq->ibcq.device;
+                       ev.element.cq = &cq->ibcq;
+                       ev.event = IB_EVENT_CQ_ERR;
+                       cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+               }
+
+               return -EBUSY;
+       }
+
+       memcpy(producer_addr(cq->queue), cqe, sizeof(*cqe));
+
+       /* make sure all changes to the CQ are written before we update the
+        * producer pointer
+        */
+       smp_wmb();
+
+       advance_producer(cq->queue);
+       spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+       /* NOTE(review): cq->notify is tested and cleared after cq_lock has
+        * been released - confirm that whoever arms it cannot race with this
+        */
+       if ((cq->notify == IB_CQ_NEXT_COMP) ||
+           (cq->notify == IB_CQ_SOLICITED && solicited)) {
+               cq->notify = 0;
+               tasklet_schedule(&cq->comp_task);
+       }
+
+       return 0;
+}
+
+/* Release the queue owned by the CQ passed as @arg, if it has one. */
+void rxe_cq_cleanup(void *arg)
+{
+       struct rxe_cq *cq = arg;
+
+       if (!cq->queue)
+               return;
+
+       rxe_queue_cleanup(cq->queue);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_dma.c b/drivers/infiniband/sw/rxe/rxe_dma.c
new file mode 100644 (file)
index 0000000..7634c1a
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* address value that the helpers in this file treat as "mapping failed" */
+#define DMA_BAD_ADDER ((u64)0)
+
+/* Return 1 if @dma_addr is the failure sentinel, 0 otherwise. */
+static int rxe_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       if (dma_addr == DMA_BAD_ADDER)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * "Map" a kernel buffer: the returned DMA address is simply @cpu_addr
+ * converted to an integer.  Warns if @direction is not a valid DMA
+ * direction; @size is not used.
+ */
+static u64 rxe_dma_map_single(struct ib_device *dev,
+                             void *cpu_addr, size_t size,
+                             enum dma_data_direction direction)
+{
+       u64 addr = (uintptr_t)cpu_addr;
+
+       WARN_ON(!valid_dma_direction(direction));
+
+       return addr;
+}
+
+/* Counterpart of rxe_dma_map_single(): nothing is undone here, the function
+ * only warns when @direction is not a valid DMA direction.
+ */
+static void rxe_dma_unmap_single(struct ib_device *dev,
+                                u64 addr, size_t size,
+                                enum dma_data_direction direction)
+{
+       WARN_ON(!valid_dma_direction(direction));
+}
+
+/*
+ * Return the kernel address of byte @offset within @page as the DMA
+ * address.
+ *
+ * DMA_BAD_ADDER is returned when [@offset, @offset + @size) does not fit in
+ * one page; 0 is returned when page_address() yields no address for @page.
+ * Warns if @direction is not a valid DMA direction.
+ */
+static u64 rxe_dma_map_page(struct ib_device *dev,
+                           struct page *page,
+                           unsigned long offset,
+                           size_t size, enum dma_data_direction direction)
+{
+       u64 addr;
+
+       WARN_ON(!valid_dma_direction(direction));
+
+       if (offset + size > PAGE_SIZE)
+               return DMA_BAD_ADDER;
+
+       addr = (uintptr_t)page_address(page);
+
+       /* only apply the offset to a real address */
+       return addr ? addr + offset : addr;
+}
+
+/* Counterpart of rxe_dma_map_page(): nothing is undone here, the function
+ * only warns when @direction is not a valid DMA direction.
+ */
+static void rxe_dma_unmap_page(struct ib_device *dev,
+                              u64 addr, size_t size,
+                              enum dma_data_direction direction)
+{
+       WARN_ON(!valid_dma_direction(direction));
+}
+
+static int rxe_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                     int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       u64 addr;
+       int i;
+       int ret = nents;
+
+       WARN_ON(!valid_dma_direction(direction));
+
+       for_each_sg(sgl, sg, nents, i) {
+               addr = (uintptr_t)page_address(sg_page(sg));
+               if (!addr) {
+                       ret = 0;
+                       break;
+               }
+               sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+               sg->dma_length = sg->length;
+#endif
+       }
+
+       return ret;
+}
+
+/* Counterpart of rxe_map_sg(): nothing is undone here, the function only
+ * warns when @direction is not a valid DMA direction.
+ */
+static void rxe_unmap_sg(struct ib_device *dev,
+                        struct scatterlist *sg, int nents,
+                        enum dma_data_direction direction)
+{
+       WARN_ON(!valid_dma_direction(direction));
+}
+
+/* Deliberately empty.  NOTE(review): presumably no sync is needed because
+ * the "DMA" addresses in this file are plain kernel addresses - confirm.
+ */
+static void rxe_sync_single_for_cpu(struct ib_device *dev,
+                                   u64 addr,
+                                   size_t size, enum dma_data_direction dir)
+{
+}
+
+/* Deliberately empty.  NOTE(review): presumably no sync is needed because
+ * the "DMA" addresses in this file are plain kernel addresses - confirm.
+ */
+static void rxe_sync_single_for_device(struct ib_device *dev,
+                                      u64 addr,
+                                      size_t size, enum dma_data_direction dir)
+{
+}
+
+/*
+ * Allocate enough whole pages for @size bytes with alloc_pages() and return
+ * their kernel address.  When @dma_handle is non-NULL it receives the same
+ * address as an integer (0 if the allocation failed).
+ *
+ * Returns NULL when no pages could be allocated.
+ */
+static void *rxe_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                   u64 *dma_handle, gfp_t flag)
+{
+       struct page *pg = alloc_pages(flag, get_order(size));
+       void *vaddr = pg ? page_address(pg) : NULL;
+
+       if (dma_handle)
+               *dma_handle = (uintptr_t)vaddr;
+
+       return vaddr;
+}
+
+/* Free the pages at @cpu_addr; the order is recomputed from @size, so @size
+ * must match the allocation.  @dma_handle is not used.
+ */
+static void rxe_dma_free_coherent(struct ib_device *dev, size_t size,
+                                 void *cpu_addr, u64 dma_handle)
+{
+       free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+/* DMA mapping operations table wiring up the helpers defined in this file */
+struct ib_dma_mapping_ops rxe_dma_mapping_ops = {
+       .mapping_error          = rxe_mapping_error,
+       .map_single             = rxe_dma_map_single,
+       .unmap_single           = rxe_dma_unmap_single,
+       .map_page               = rxe_dma_map_page,
+       .unmap_page             = rxe_dma_unmap_page,
+       .map_sg                 = rxe_map_sg,
+       .unmap_sg               = rxe_unmap_sg,
+       .sync_single_for_cpu    = rxe_sync_single_for_cpu,
+       .sync_single_for_device = rxe_sync_single_for_device,
+       .alloc_coherent         = rxe_dma_alloc_coherent,
+       .free_coherent          = rxe_dma_free_coherent
+};
diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h
new file mode 100644 (file)
index 0000000..d57b5e9
--- /dev/null
@@ -0,0 +1,952 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_HDR_H
+#define RXE_HDR_H
+
+/* Information extracted from a packet.  For received packets it is stored
+ * in the control block (cb array) of the sk_buff that carries the packet,
+ * so it must fit there: at most 48 bytes.
+ */
+struct rxe_pkt_info {
+       struct rxe_dev          *rxe;           /* device that owns packet */
+       struct rxe_qp           *qp;            /* qp that owns packet */
+       struct rxe_send_wqe     *wqe;           /* send wqe */
+       u8                      *hdr;           /* points to bth */
+       u32                     mask;           /* useful info about pkt */
+       u32                     psn;            /* bth psn of packet */
+       u16                     pkey_index;     /* partition of pkt */
+       u16                     paylen;         /* length of bth - icrc */
+       u8                      port_num;       /* port pkt received on */
+       u8                      opcode;         /* bth opcode of packet */
+       u8                      offset;         /* bth offset from pkt->hdr */
+};
+
+/* Convert between an sk_buff and the rxe_pkt_info kept in its cb array.
+ * These macros should be used only for received skbs.
+ */
+#define SKB_TO_PKT(skb) ((struct rxe_pkt_info *)(skb)->cb)
+#define PKT_TO_SKB(pkt) container_of((void *)(pkt), struct sk_buff, cb)
+
+/*
+ * IBA header types and methods
+ *
+ * Some of these are for reference and completeness only, since
+ * rxe does not currently support RD transport. Most of this could
+ * be moved into IB core; ib_pack.h has part of it but is incomplete.
+ *
+ * Header-specific routines insert/extract values to/from headers.
+ * The routines named __hhh_(set_)fff() take a pointer to an
+ * hhh header and get (set) the fff field. The routines named
+ * hhh_(set_)fff() take a packet info struct and find the
+ * header and field based on the opcode in the packet.
+ * Conversion between network byte order and cpu order is also done.
+ */
+
+#define RXE_ICRC_SIZE          (4)
+#define RXE_MAX_HDR_LENGTH     (80)
+
+/******************************************************************************
+ * Base Transport Header
+ ******************************************************************************/
+/* Multi-byte fields are kept big endian (__be16/__be32); the accessors
+ * below convert them and apply the masks that follow.
+ */
+struct rxe_bth {
+       u8                      opcode;
+       u8                      flags;          /* se, mig, pad, tver */
+       __be16                  pkey;
+       __be32                  qpn;            /* fecn, becn, resv6a, qpn */
+       __be32                  apsn;           /* ack, resv7, psn */
+};
+
+#define BTH_TVER               (0)
+#define BTH_DEF_PKEY           (0xffff)
+
+/* SE/MIG/PAD/TVER select bits of rxe_bth.flags; FECN/BECN/RESV6A/QPN select
+ * bits of rxe_bth.qpn and ACK/RESV7/PSN bits of rxe_bth.apsn, both in cpu
+ * byte order
+ */
+#define BTH_SE_MASK            (0x80)
+#define BTH_MIG_MASK           (0x40)
+#define BTH_PAD_MASK           (0x30)
+#define BTH_TVER_MASK          (0x0f)
+#define BTH_FECN_MASK          (0x80000000)
+#define BTH_BECN_MASK          (0x40000000)
+#define BTH_RESV6A_MASK                (0x3f000000)
+#define BTH_QPN_MASK           (0x00ffffff)
+#define BTH_ACK_MASK           (0x80000000)
+#define BTH_RESV7_MASK         (0x7f000000)
+#define BTH_PSN_MASK           (0x00ffffff)
+
+/* Read the opcode byte of the BTH at @arg. */
+static inline u8 __bth_opcode(void *arg)
+{
+       return ((struct rxe_bth *)arg)->opcode;
+}
+
+/* Store @opcode in the opcode byte of the BTH at @arg. */
+static inline void __bth_set_opcode(void *arg, u8 opcode)
+{
+       ((struct rxe_bth *)arg)->opcode = opcode;
+}
+
+/* Return 1 if the SE bit is set in the BTH at @arg, else 0. */
+static inline u8 __bth_se(void *arg)
+{
+       u8 flags = ((struct rxe_bth *)arg)->flags;
+
+       return !!(flags & BTH_SE_MASK);
+}
+
+/* Set the SE bit of the BTH at @arg when @se is non-zero, clear it
+ * otherwise; the other flag bits are preserved.
+ */
+static inline void __bth_set_se(void *arg, int se)
+{
+       struct rxe_bth *hdr = arg;
+       u8 flags = hdr->flags & ~BTH_SE_MASK;
+
+       if (se)
+               flags |= BTH_SE_MASK;
+
+       hdr->flags = flags;
+}
+
+/* Return 1 if the MIG bit is set in the BTH at @arg, else 0. */
+static inline u8 __bth_mig(void *arg)
+{
+       u8 flags = ((struct rxe_bth *)arg)->flags;
+
+       return !!(flags & BTH_MIG_MASK);
+}
+
+/* Set the MIG bit of the BTH at @arg when @mig is non-zero, clear it
+ * otherwise; the other flag bits are preserved.
+ */
+static inline void __bth_set_mig(void *arg, u8 mig)
+{
+       struct rxe_bth *hdr = arg;
+       u8 flags = hdr->flags & ~BTH_MIG_MASK;
+
+       if (mig)
+               flags |= BTH_MIG_MASK;
+
+       hdr->flags = flags;
+}
+
+/* Return the two-bit pad field (flag bits 5:4) of the BTH at @arg. */
+static inline u8 __bth_pad(void *arg)
+{
+       u8 flags = ((struct rxe_bth *)arg)->flags;
+
+       return (flags & BTH_PAD_MASK) >> 4;
+}
+
+/* Replace the pad field of the BTH at @arg with the low two bits of @pad,
+ * keeping the remaining flag bits.
+ */
+static inline void __bth_set_pad(void *arg, u8 pad)
+{
+       struct rxe_bth *hdr = arg;
+       u8 keep = hdr->flags & ~BTH_PAD_MASK;
+
+       hdr->flags = keep | ((pad << 4) & BTH_PAD_MASK);
+}
+
+/* Return the tver field (low four flag bits) of the BTH at @arg. */
+static inline u8 __bth_tver(void *arg)
+{
+       return ((struct rxe_bth *)arg)->flags & BTH_TVER_MASK;
+}
+
+/* Replace the tver field of the BTH at @arg with the low four bits of
+ * @tver, keeping the remaining flag bits.
+ */
+static inline void __bth_set_tver(void *arg, u8 tver)
+{
+       struct rxe_bth *hdr = arg;
+       u8 keep = hdr->flags & ~BTH_TVER_MASK;
+
+       hdr->flags = keep | (tver & BTH_TVER_MASK);
+}
+
+/* Return the pkey of the BTH at @arg in cpu byte order. */
+static inline u16 __bth_pkey(void *arg)
+{
+       return be16_to_cpu(((struct rxe_bth *)arg)->pkey);
+}
+
+/* Store @pkey, converted to big endian, in the BTH at @arg. */
+static inline void __bth_set_pkey(void *arg, u16 pkey)
+{
+       ((struct rxe_bth *)arg)->pkey = cpu_to_be16(pkey);
+}
+
+/* Return the 24-bit QPN of the BTH at @arg in cpu byte order. */
+static inline u32 __bth_qpn(void *arg)
+{
+       u32 word = be32_to_cpu(((struct rxe_bth *)arg)->qpn);
+
+       return word & BTH_QPN_MASK;
+}
+
+/* Replace the 24-bit QPN of the BTH at @arg with @qpn, preserving the top
+ * byte of the word.
+ */
+static inline void __bth_set_qpn(void *arg, u32 qpn)
+{
+       struct rxe_bth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->qpn);
+
+       word &= ~BTH_QPN_MASK;
+       word |= qpn & BTH_QPN_MASK;
+       hdr->qpn = cpu_to_be32(word);
+}
+
+/* Return 1 if the FECN bit is set in the BTH at @arg, else 0. */
+static inline int __bth_fecn(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+
+       /* mask is converted instead of the field, so no swap of the data */
+       return !!(hdr->qpn & cpu_to_be32(BTH_FECN_MASK));
+}
+
+/* Set the FECN bit of the BTH at @arg when @fecn is non-zero, clear it
+ * otherwise.
+ */
+static inline void __bth_set_fecn(void *arg, int fecn)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_FECN_MASK);
+
+       if (fecn)
+               hdr->qpn |= bit;
+       else
+               hdr->qpn &= ~bit;
+}
+
+/* Return 1 if the BECN bit is set in the BTH at @arg, else 0. */
+static inline int __bth_becn(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+
+       return !!(hdr->qpn & cpu_to_be32(BTH_BECN_MASK));
+}
+
+/* Set the BECN bit of the BTH at @arg when @becn is non-zero, clear it
+ * otherwise.
+ */
+static inline void __bth_set_becn(void *arg, int becn)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_BECN_MASK);
+
+       if (becn)
+               hdr->qpn |= bit;
+       else
+               hdr->qpn &= ~bit;
+}
+
+/* Return the six reserved bits that sit above the QPN in the BTH at @arg. */
+static inline u8 __bth_resv6a(void *arg)
+{
+       u32 word = be32_to_cpu(((struct rxe_bth *)arg)->qpn);
+
+       return (word & BTH_RESV6A_MASK) >> 24;
+}
+
+/* Clear the six reserved bits above the QPN in the BTH at @arg.
+ *
+ * Only the reserved bits are touched: QPN, FECN and BECN keep their values,
+ * in the same way __bth_set_resv7() preserves ACK and PSN.
+ */
+static inline void __bth_set_resv6a(void *arg)
+{
+       struct rxe_bth *bth = arg;
+
+       bth->qpn &= ~cpu_to_be32(BTH_RESV6A_MASK);
+}
+
+/* Return 1 if the ACK bit is set in the BTH at @arg, else 0. */
+static inline int __bth_ack(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+
+       return !!(hdr->apsn & cpu_to_be32(BTH_ACK_MASK));
+}
+
+/* Set the ACK bit of the BTH at @arg when @ack is non-zero, clear it
+ * otherwise.
+ */
+static inline void __bth_set_ack(void *arg, int ack)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_ACK_MASK);
+
+       if (ack)
+               hdr->apsn |= bit;
+       else
+               hdr->apsn &= ~bit;
+}
+
+/* Clear the seven reserved bits between ACK and PSN in the BTH at @arg. */
+static inline void __bth_set_resv7(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 resv = cpu_to_be32(BTH_RESV7_MASK);
+
+       hdr->apsn &= ~resv;
+}
+
+/* Return the 24-bit PSN of the BTH at @arg in cpu byte order. */
+static inline u32 __bth_psn(void *arg)
+{
+       u32 word = be32_to_cpu(((struct rxe_bth *)arg)->apsn);
+
+       return word & BTH_PSN_MASK;
+}
+
+/* Replace the 24-bit PSN of the BTH at @arg with @psn, preserving the top
+ * byte of the word.
+ */
+static inline void __bth_set_psn(void *arg, u32 psn)
+{
+       struct rxe_bth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->apsn);
+
+       word &= ~BTH_PSN_MASK;
+       word |= psn & BTH_PSN_MASK;
+       hdr->apsn = cpu_to_be32(word);
+}
+
+/* __bth_opcode() applied to the BTH of @pkt. */
+static inline u8 bth_opcode(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_opcode(bth);
+}
+
+/* __bth_set_opcode() applied to the BTH of @pkt. */
+static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_opcode(bth, opcode);
+}
+
+/* __bth_se() applied to the BTH of @pkt. */
+static inline u8 bth_se(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_se(bth);
+}
+
+/* __bth_set_se() applied to the BTH of @pkt. */
+static inline void bth_set_se(struct rxe_pkt_info *pkt, int se)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_se(bth, se);
+}
+
+/* __bth_mig() applied to the BTH of @pkt. */
+static inline u8 bth_mig(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_mig(bth);
+}
+
+/* __bth_set_mig() applied to the BTH of @pkt. */
+static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_mig(bth, mig);
+}
+
+/* __bth_pad() applied to the BTH of @pkt. */
+static inline u8 bth_pad(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_pad(bth);
+}
+
+/* __bth_set_pad() applied to the BTH of @pkt. */
+static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_pad(bth, pad);
+}
+
+/* __bth_tver() applied to the BTH of @pkt. */
+static inline u8 bth_tver(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_tver(bth);
+}
+
+/* __bth_set_tver() applied to the BTH of @pkt. */
+static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_tver(bth, tver);
+}
+
+/* __bth_pkey() applied to the BTH of @pkt. */
+static inline u16 bth_pkey(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_pkey(bth);
+}
+
+/* __bth_set_pkey() applied to the BTH of @pkt. */
+static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_pkey(bth, pkey);
+}
+
+/* __bth_qpn() applied to the BTH of @pkt. */
+static inline u32 bth_qpn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_qpn(bth);
+}
+
+/* __bth_set_qpn() applied to the BTH of @pkt. */
+static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_qpn(bth, qpn);
+}
+
+/* __bth_fecn() applied to the BTH of @pkt. */
+static inline int bth_fecn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_fecn(bth);
+}
+
+/* __bth_set_fecn() applied to the BTH of @pkt. */
+static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_fecn(bth, fecn);
+}
+
+/* __bth_becn() applied to the BTH of @pkt. */
+static inline int bth_becn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_becn(bth);
+}
+
+/* __bth_set_becn() applied to the BTH of @pkt. */
+static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_becn(bth, becn);
+}
+
+/* __bth_resv6a() applied to the BTH of @pkt. */
+static inline u8 bth_resv6a(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_resv6a(bth);
+}
+
+/* __bth_set_resv6a() applied to the BTH of @pkt. */
+static inline void bth_set_resv6a(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_resv6a(bth);
+}
+
+/* __bth_ack() applied to the BTH of @pkt. */
+static inline int bth_ack(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_ack(bth);
+}
+
+/* __bth_set_ack() applied to the BTH of @pkt. */
+static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_ack(bth, ack);
+}
+
+/* __bth_set_resv7() applied to the BTH of @pkt. */
+static inline void bth_set_resv7(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_resv7(bth);
+}
+
+/* __bth_psn() applied to the BTH of @pkt. */
+static inline u32 bth_psn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_psn(bth);
+}
+
+/* __bth_set_psn() applied to the BTH of @pkt. */
+static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_psn(bth, psn);
+}
+
+/*
+ * Write every field of the BTH of @pkt in one go.
+ *
+ * The flags byte is built from @pad, @se and @mig (tver bits are left 0);
+ * the qpn word holds only the low 24 bits of @qpn; the apsn word holds the
+ * low 24 bits of @psn plus the ACK bit when @ack_req is non-zero.  All
+ * reserved bits end up cleared.
+ */
+static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se,
+                           int mig, int pad, u16 pkey, u32 qpn, int ack_req,
+                           u32 psn)
+{
+       struct rxe_bth *hdr = (struct rxe_bth *)(pkt->hdr + pkt->offset);
+       u8 flags = (pad << 4) & BTH_PAD_MASK;
+       u32 apsn = psn & BTH_PSN_MASK;
+
+       if (se)
+               flags |= BTH_SE_MASK;
+       if (mig)
+               flags |= BTH_MIG_MASK;
+       if (ack_req)
+               apsn |= BTH_ACK_MASK;
+
+       hdr->opcode = opcode;
+       hdr->flags = flags;
+       hdr->pkey = cpu_to_be16(pkey);
+       hdr->qpn = cpu_to_be32(qpn & BTH_QPN_MASK);
+       hdr->apsn = cpu_to_be32(apsn);
+}
+
+/******************************************************************************
+ * Reliable Datagram Extended Transport Header
+ ******************************************************************************/
+/* One big-endian word; the accessors use its low 24 bits as the EEN. */
+struct rxe_rdeth {
+       __be32                  een;
+};
+
+#define RDETH_EEN_MASK         (0x00ffffff)
+
+/* Return the 24-bit EEN of the RDETH at @arg in cpu byte order.
+ *
+ * The result is a u32: the field is 24 bits wide (RDETH_EEN_MASK), so a
+ * narrower return type would drop its upper bits.
+ */
+static inline u32 __rdeth_een(void *arg)
+{
+       struct rxe_rdeth *rdeth = arg;
+
+       return RDETH_EEN_MASK & be32_to_cpu(rdeth->een);
+}
+
+/* Store the low 24 bits of @een in the RDETH at @arg; the top byte of the
+ * word is written as zero.
+ */
+static inline void __rdeth_set_een(void *arg, u32 een)
+{
+       struct rxe_rdeth *rdeth = arg;
+
+       rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een);
+}
+
+/* __rdeth_een() applied to the RDETH of @pkt, which is located through
+ * rxe_opcode[pkt->opcode].offset[RXE_RDETH].
+ */
+static inline u32 rdeth_een(struct rxe_pkt_info *pkt)
+{
+       return __rdeth_een(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_RDETH]);
+}
+
+/* __rdeth_set_een() applied to the RDETH of @pkt. */
+static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een)
+{
+       __rdeth_set_een(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een);
+}
+
+/******************************************************************************
+ * Datagram Extended Transport Header
+ ******************************************************************************/
+/* Two big-endian words; the accessors use the low 24 bits of sqp. */
+struct rxe_deth {
+       __be32                  qkey;
+       __be32                  sqp;
+};
+
+#define GSI_QKEY               (0x80010000)
+#define DETH_SQP_MASK          (0x00ffffff)
+
+/* Return the qkey of the DETH at @arg in cpu byte order. */
+static inline u32 __deth_qkey(void *arg)
+{
+       return be32_to_cpu(((struct rxe_deth *)arg)->qkey);
+}
+
+/* Store @qkey, converted to big endian, in the DETH at @arg. */
+static inline void __deth_set_qkey(void *arg, u32 qkey)
+{
+       ((struct rxe_deth *)arg)->qkey = cpu_to_be32(qkey);
+}
+
+/* Return the 24-bit sqp field of the DETH at @arg in cpu byte order. */
+static inline u32 __deth_sqp(void *arg)
+{
+       u32 word = be32_to_cpu(((struct rxe_deth *)arg)->sqp);
+
+       return word & DETH_SQP_MASK;
+}
+
+/* Store the low 24 bits of @sqp in the DETH at @arg; the top byte of the
+ * word is written as zero.
+ */
+static inline void __deth_set_sqp(void *arg, u32 sqp)
+{
+       u32 word = sqp & DETH_SQP_MASK;
+
+       ((struct rxe_deth *)arg)->sqp = cpu_to_be32(word);
+}
+
+/* __deth_qkey() applied to the DETH of @pkt, which is located through
+ * rxe_opcode[pkt->opcode].offset[RXE_DETH].
+ */
+static inline u32 deth_qkey(struct rxe_pkt_info *pkt)
+{
+       u8 *deth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       return __deth_qkey(deth);
+}
+
+/* __deth_set_qkey() applied to the DETH of @pkt. */
+static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey)
+{
+       u8 *deth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       __deth_set_qkey(deth, qkey);
+}
+
+/* __deth_sqp() applied to the DETH of @pkt. */
+static inline u32 deth_sqp(struct rxe_pkt_info *pkt)
+{
+       u8 *deth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       return __deth_sqp(deth);
+}
+
+/* __deth_set_sqp() applied to the DETH of @pkt. */
+static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp)
+{
+       u8 *deth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       __deth_set_sqp(deth, sqp);
+}
+
+/******************************************************************************
+ * RDMA Extended Transport Header
+ ******************************************************************************/
+/* All three fields are kept big endian and converted by the accessors. */
+struct rxe_reth {
+       __be64                  va;
+       __be32                  rkey;
+       __be32                  len;
+};
+
+/* Return the va field of the RETH at @arg in cpu byte order. */
+static inline u64 __reth_va(void *arg)
+{
+       return be64_to_cpu(((struct rxe_reth *)arg)->va);
+}
+
+/* Store @va, converted to big endian, in the RETH at @arg. */
+static inline void __reth_set_va(void *arg, u64 va)
+{
+       ((struct rxe_reth *)arg)->va = cpu_to_be64(va);
+}
+
+/* Return the rkey field of the RETH at @arg in cpu byte order. */
+static inline u32 __reth_rkey(void *arg)
+{
+       return be32_to_cpu(((struct rxe_reth *)arg)->rkey);
+}
+
+/* Store @rkey, converted to big endian, in the RETH at @arg. */
+static inline void __reth_set_rkey(void *arg, u32 rkey)
+{
+       ((struct rxe_reth *)arg)->rkey = cpu_to_be32(rkey);
+}
+
+/* Return the len field of the RETH at @arg in cpu byte order. */
+static inline u32 __reth_len(void *arg)
+{
+       return be32_to_cpu(((struct rxe_reth *)arg)->len);
+}
+
+/* Store @len, converted to big endian, in the RETH at @arg. */
+static inline void __reth_set_len(void *arg, u32 len)
+{
+       ((struct rxe_reth *)arg)->len = cpu_to_be32(len);
+}
+
+/* __reth_va() applied to the RETH of @pkt, which is located through
+ * rxe_opcode[pkt->opcode].offset[RXE_RETH].
+ */
+static inline u64 reth_va(struct rxe_pkt_info *pkt)
+{
+       u8 *reth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       return __reth_va(reth);
+}
+
+/* __reth_set_va() applied to the RETH of @pkt. */
+static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+       u8 *reth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       __reth_set_va(reth, va);
+}
+
+/* __reth_rkey() applied to the RETH of @pkt. */
+static inline u32 reth_rkey(struct rxe_pkt_info *pkt)
+{
+       u8 *reth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       return __reth_rkey(reth);
+}
+
+/* __reth_set_rkey() applied to the RETH of @pkt. */
+static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+       u8 *reth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       __reth_set_rkey(reth, rkey);
+}
+
+/* __reth_len() applied to the RETH of @pkt. */
+static inline u32 reth_len(struct rxe_pkt_info *pkt)
+{
+       u8 *reth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       return __reth_len(reth);
+}
+
+/* __reth_set_len() applied to the RETH of @pkt. */
+static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len)
+{
+       u8 *reth = pkt->hdr + pkt->offset +
+                  rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       __reth_set_len(reth, len);
+}
+
+/******************************************************************************
+ * Atomic Extended Transport Header
+ ******************************************************************************/
+/* Packed so that swap_add directly follows the 32-bit rkey with no padding;
+ * all fields are kept big endian and converted by the accessors.
+ */
+struct rxe_atmeth {
+       __be64                  va;
+       __be32                  rkey;
+       __be64                  swap_add;
+       __be64                  comp;
+} __attribute__((__packed__));
+
+/* Return the va field of the ATMETH at @arg in cpu byte order. */
+static inline u64 __atmeth_va(void *arg)
+{
+       return be64_to_cpu(((struct rxe_atmeth *)arg)->va);
+}
+
+/* Store @va, converted to big endian, in the ATMETH at @arg. */
+static inline void __atmeth_set_va(void *arg, u64 va)
+{
+       ((struct rxe_atmeth *)arg)->va = cpu_to_be64(va);
+}
+
+/* Return the rkey field of the ATMETH at @arg in cpu byte order. */
+static inline u32 __atmeth_rkey(void *arg)
+{
+       return be32_to_cpu(((struct rxe_atmeth *)arg)->rkey);
+}
+
+/* Store @rkey, converted to big endian, in the ATMETH at @arg. */
+static inline void __atmeth_set_rkey(void *arg, u32 rkey)
+{
+       ((struct rxe_atmeth *)arg)->rkey = cpu_to_be32(rkey);
+}
+
+/* Return the swap_add field of the ATMETH at @arg in cpu byte order. */
+static inline u64 __atmeth_swap_add(void *arg)
+{
+       return be64_to_cpu(((struct rxe_atmeth *)arg)->swap_add);
+}
+
+/* Store @swap_add, converted to big endian, in the ATMETH at @arg. */
+static inline void __atmeth_set_swap_add(void *arg, u64 swap_add)
+{
+       ((struct rxe_atmeth *)arg)->swap_add = cpu_to_be64(swap_add);
+}
+
+/* Return the comp field of the ATMETH at @arg in cpu byte order. */
+static inline u64 __atmeth_comp(void *arg)
+{
+       return be64_to_cpu(((struct rxe_atmeth *)arg)->comp);
+}
+
+/* Store @comp, converted to big endian, in the ATMETH at @arg. */
+static inline void __atmeth_set_comp(void *arg, u64 comp)
+{
+       ((struct rxe_atmeth *)arg)->comp = cpu_to_be64(comp);
+}
+
+/* __atmeth_va() applied to the ATMETH of @pkt, which is located through
+ * rxe_opcode[pkt->opcode].offset[RXE_ATMETH].
+ */
+static inline u64 atmeth_va(struct rxe_pkt_info *pkt)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_va(atmeth);
+}
+
+/* __atmeth_set_va() applied to the ATMETH of @pkt. */
+static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_va(atmeth, va);
+}
+
+/* __atmeth_rkey() applied to the ATMETH of @pkt. */
+static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_rkey(atmeth);
+}
+
+/* __atmeth_set_rkey() applied to the ATMETH of @pkt. */
+static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_rkey(atmeth, rkey);
+}
+
+/* __atmeth_swap_add() applied to the ATMETH of @pkt. */
+static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_swap_add(atmeth);
+}
+
+/* __atmeth_set_swap_add() applied to the ATMETH of @pkt. */
+static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_swap_add(atmeth, swap_add);
+}
+
+/* __atmeth_comp() applied to the ATMETH of @pkt. */
+static inline u64 atmeth_comp(struct rxe_pkt_info *pkt)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_comp(atmeth);
+}
+
+/* __atmeth_set_comp() applied to the ATMETH of @pkt. */
+static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp)
+{
+       u8 *atmeth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_comp(atmeth, comp);
+}
+
+/******************************************************************************
+ * Ack Extended Transport Header
+ ******************************************************************************/
+/* One big-endian word: syndrome in the top byte, MSN in the low 24 bits. */
+struct rxe_aeth {
+       __be32                  smsn;
+};
+
+#define AETH_SYN_MASK          (0xff000000)
+#define AETH_MSN_MASK          (0x00ffffff)
+
+/* Values of the 8-bit AETH syndrome.  AETH_TYPE_MASK selects its top three
+ * bits; AETH_ACK, AETH_RNR_NAK, AETH_RSVD and AETH_NAK are the four values
+ * those bits can take here.  The AETH_NAK_* entries are complete syndrome
+ * bytes whose top three bits equal AETH_NAK.
+ */
+enum aeth_syndrome {
+       AETH_TYPE_MASK          = 0xe0,
+       AETH_ACK                = 0x00,
+       AETH_RNR_NAK            = 0x20,
+       AETH_RSVD               = 0x40,
+       AETH_NAK                = 0x60,
+       AETH_ACK_UNLIMITED      = 0x1f,
+       AETH_NAK_PSN_SEQ_ERROR  = 0x60,
+       AETH_NAK_INVALID_REQ    = 0x61,
+       AETH_NAK_REM_ACC_ERR    = 0x62,
+       AETH_NAK_REM_OP_ERR     = 0x63,
+       AETH_NAK_INV_RD_REQ     = 0x64,
+};
+
+static inline u8 __aeth_syn(void *arg)
+{
+       struct rxe_aeth *aeth = arg;
+
+       return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24;
+}
+
+static inline void __aeth_set_syn(void *arg, u8 syn)
+{
+       struct rxe_aeth *aeth = arg;
+       u32 smsn = be32_to_cpu(aeth->smsn);
+
+       aeth->smsn = cpu_to_be32((AETH_SYN_MASK & ((u32)syn << 24)) |
+                        (~AETH_SYN_MASK & smsn)); /* u32: no signed overflow */
+}
+
+static inline u32 __aeth_msn(void *arg)
+{
+       struct rxe_aeth *aeth = arg;
+
+       return AETH_MSN_MASK & be32_to_cpu(aeth->smsn);
+}
+
+static inline void __aeth_set_msn(void *arg, u32 msn)
+{
+       struct rxe_aeth *aeth = arg;
+       u32 smsn = be32_to_cpu(aeth->smsn);
+
+       aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) |
+                        (~AETH_MSN_MASK & smsn));
+}
+
+static inline u8 aeth_syn(struct rxe_pkt_info *pkt)
+{
+       return __aeth_syn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn)
+{
+       __aeth_set_syn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn);
+}
+
+static inline u32 aeth_msn(struct rxe_pkt_info *pkt)
+{
+       return __aeth_msn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn)
+{
+       __aeth_set_msn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn);
+}
+
+/******************************************************************************
+ * Atomic Ack Extended Transport Header
+ ******************************************************************************/
+struct rxe_atmack {
+       __be64                  orig;
+};
+
+static inline u64 __atmack_orig(void *arg)
+{
+       struct rxe_atmack *atmack = arg;
+
+       return be64_to_cpu(atmack->orig);
+}
+
+static inline void __atmack_set_orig(void *arg, u64 orig)
+{
+       struct rxe_atmack *atmack = arg;
+
+       atmack->orig = cpu_to_be64(orig);
+}
+
+static inline u64 atmack_orig(struct rxe_pkt_info *pkt)
+{
+       return __atmack_orig(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]);
+}
+
+static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig)
+{
+       __atmack_set_orig(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig);
+}
+
+/******************************************************************************
+ * Immediate Extended Transport Header
+ ******************************************************************************/
+struct rxe_immdt {
+       __be32                  imm;
+};
+
+static inline __be32 __immdt_imm(void *arg)
+{
+       struct rxe_immdt *immdt = arg;
+
+       return immdt->imm;
+}
+
+static inline void __immdt_set_imm(void *arg, __be32 imm)
+{
+       struct rxe_immdt *immdt = arg;
+
+       immdt->imm = imm;
+}
+
+static inline __be32 immdt_imm(struct rxe_pkt_info *pkt)
+{
+       return __immdt_imm(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]);
+}
+
+static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm)
+{
+       __immdt_set_imm(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm);
+}
+
+/******************************************************************************
+ * Invalidate Extended Transport Header
+ ******************************************************************************/
+struct rxe_ieth {
+       __be32                  rkey;
+};
+
+static inline u32 __ieth_rkey(void *arg)
+{
+       struct rxe_ieth *ieth = arg;
+
+       return be32_to_cpu(ieth->rkey);
+}
+
+static inline void __ieth_set_rkey(void *arg, u32 rkey)
+{
+       struct rxe_ieth *ieth = arg;
+
+       ieth->rkey = cpu_to_be32(rkey);
+}
+
+static inline u32 ieth_rkey(struct rxe_pkt_info *pkt)
+{
+       return __ieth_rkey(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IETH]);
+}
+
+static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+       __ieth_set_rkey(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey);
+}
+
+enum rxe_hdr_length {
+       RXE_BTH_BYTES           = sizeof(struct rxe_bth),
+       RXE_DETH_BYTES          = sizeof(struct rxe_deth),
+       RXE_IMMDT_BYTES         = sizeof(struct rxe_immdt),
+       RXE_RETH_BYTES          = sizeof(struct rxe_reth),
+       RXE_AETH_BYTES          = sizeof(struct rxe_aeth),
+       RXE_ATMACK_BYTES        = sizeof(struct rxe_atmack),
+       RXE_ATMETH_BYTES        = sizeof(struct rxe_atmeth),
+       RXE_IETH_BYTES          = sizeof(struct rxe_ieth),
+       RXE_RDETH_BYTES         = sizeof(struct rxe_rdeth),
+};
+
+static inline size_t header_size(struct rxe_pkt_info *pkt)
+{
+       return pkt->offset + rxe_opcode[pkt->opcode].length;
+}
+
+static inline void *payload_addr(struct rxe_pkt_info *pkt)
+{
+       return pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD];
+}
+
+static inline size_t payload_size(struct rxe_pkt_info *pkt)
+{
+       return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]
+               - bth_pad(pkt) - RXE_ICRC_SIZE;
+}
+
+#endif /* RXE_HDR_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c
new file mode 100644 (file)
index 0000000..413b56b
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/*
+ * Compute a partial ICRC over the IP, UDP and IB transport headers.
+ * The IP/UDP/BTH headers are copied to a local buffer and selected fields
+ * are overwritten with ones there; skb and pkt->hdr are only read.
+ */
+u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb)
+{
+       struct iphdr *ip4h = NULL;
+       struct ipv6hdr *ip6h = NULL;
+       struct udphdr *udph;
+       struct rxe_bth *bth;
+       u32 crc;
+       int hdr_size = sizeof(struct udphdr) +
+               (skb->protocol == htons(ETH_P_IP) ?
+               sizeof(struct iphdr) : sizeof(struct ipv6hdr));
+       /* pseudo header buffer size is calculated using ipv6 header size since
+        * it is bigger than ipv4
+        */
+       u8 pshdr[sizeof(struct udphdr) +
+               sizeof(struct ipv6hdr) +
+               RXE_BTH_BYTES];
+
+       /* This seed is the result of computing a CRC with a seed of
+        * 0xffffffff and 8 bytes of 0xff representing a masked LRH.
+        */
+       crc = 0xdebb20e3;
+
+       if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */
+               memcpy(pshdr, ip_hdr(skb), hdr_size);
+               ip4h = (struct iphdr *)pshdr;
+               udph = (struct udphdr *)(ip4h + 1);
+
+               ip4h->ttl = 0xff;
+               ip4h->check = CSUM_MANGLED_0;
+               ip4h->tos = 0xff;
+       } else {                                /* IPv6 */
+               memcpy(pshdr, ipv6_hdr(skb), hdr_size);
+               ip6h = (struct ipv6hdr *)pshdr;
+               udph = (struct udphdr *)(ip6h + 1);
+
+               memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl));
+               ip6h->priority = 0xf;
+               ip6h->hop_limit = 0xff;
+       }
+       udph->check = CSUM_MANGLED_0;
+
+       /* the BTH copy goes right behind the IP and UDP headers */
+       memcpy(&pshdr[hdr_size], pkt->hdr, RXE_BTH_BYTES);
+       bth = (struct rxe_bth *)&pshdr[hdr_size];
+
+       /* exclude bth.resv8a */
+       bth->qpn |= cpu_to_be32(~BTH_QPN_MASK);
+
+       crc = crc32_le(crc, pshdr, hdr_size + RXE_BTH_BYTES);
+
+       /* And finish to compute the CRC on the remainder of the headers. */
+       crc = crc32_le(crc, pkt->hdr + RXE_BTH_BYTES,
+                      rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES);
+       return crc;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
new file mode 100644 (file)
index 0000000..4a5484e
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_LOC_H
+#define RXE_LOC_H
+
+/* rxe_av.c */
+
+int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr);
+
+int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num,
+                    struct rxe_av *av, struct ib_ah_attr *attr);
+
+int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av,
+                  struct ib_ah_attr *attr);
+
+int rxe_av_fill_ip_info(struct rxe_dev *rxe,
+                       struct rxe_av *av,
+                       struct ib_ah_attr *attr,
+                       struct ib_gid_attr *sgid_attr,
+                       union ib_gid *sgid);
+
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt);
+
+/* rxe_cq.c */
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+                   int cqe, int comp_vector, struct ib_udata *udata);
+
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+                    int comp_vector, struct ib_ucontext *context,
+                    struct ib_udata *udata);
+
+int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, struct ib_udata *udata);
+
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
+
+void rxe_cq_cleanup(void *arg);
+
+/* rxe_mcast.c */
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+                     struct rxe_mc_grp **grp_p);
+
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                          struct rxe_mc_grp *grp);
+
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                           union ib_gid *mgid);
+
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp);
+
+void rxe_mc_cleanup(void *arg);
+
+/* rxe_mmap.c */
+struct rxe_mmap_info {
+       struct list_head        pending_mmaps;
+       struct ib_ucontext      *context;
+       struct kref             ref;
+       void                    *obj;
+
+       struct mminfo info;
+};
+
+void rxe_mmap_release(struct kref *ref);
+
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev,
+                                          u32 size,
+                                          struct ib_ucontext *context,
+                                          void *obj);
+
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+/* rxe_mr.c */
+enum copy_direction {
+       to_mem_obj,
+       from_mem_obj,
+};
+
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+                    int access, struct rxe_mem *mem);
+
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+                     u64 length, u64 iova, int access, struct ib_udata *udata,
+                     struct rxe_mem *mr);
+
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+                     int max_pages, struct rxe_mem *mem);
+
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr,
+                int length, enum copy_direction dir, u32 *crcp);
+
+int copy_data(struct rxe_dev *rxe, struct rxe_pd *pd, int access,
+             struct rxe_dma_info *dma, void *addr, int length,
+             enum copy_direction dir, u32 *crcp);
+
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length);
+
+enum lookup_type {
+       lookup_local,
+       lookup_remote,
+};
+
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+                          enum lookup_type type);
+
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length);
+
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+                     u64 *page, int num_pages, u64 iova);
+
+void rxe_mem_cleanup(void *arg);
+
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
+
+/* rxe_qp.c */
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init);
+
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+                    struct ib_qp_init_attr *init, struct ib_udata *udata,
+                    struct ib_pd *ibpd);
+
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init);
+
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+                   struct ib_qp_attr *attr, int mask);
+
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr,
+                    int mask, struct ib_udata *udata);
+
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask);
+
+void rxe_qp_error(struct rxe_qp *qp);
+
+void rxe_qp_destroy(struct rxe_qp *qp);
+
+void rxe_qp_cleanup(void *arg);
+
+static inline int qp_num(struct rxe_qp *qp)
+{
+       return qp->ibqp.qp_num;
+}
+
+static inline enum ib_qp_type qp_type(struct rxe_qp *qp)
+{
+       return qp->ibqp.qp_type;
+}
+
+static inline enum ib_qp_state qp_state(struct rxe_qp *qp)
+{
+       return qp->attr.qp_state;
+}
+
+static inline int qp_mtu(struct rxe_qp *qp)
+{
+       if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC)
+               return qp->attr.path_mtu;
+       else
+               return RXE_PORT_MAX_MTU;
+}
+
+static inline int rcv_wqe_size(int max_sge)
+{
+       return sizeof(struct rxe_recv_wqe) +
+               max_sge * sizeof(struct ib_sge);
+}
+
+void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res);
+
+static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
+{
+       qp->resp.res_head++;
+       if (unlikely(qp->resp.res_head == qp->attr.max_rd_atomic))
+               qp->resp.res_head = 0;
+}
+
+void retransmit_timer(unsigned long data);
+void rnr_nak_timer(unsigned long data);
+
+void dump_qp(struct rxe_qp *qp);
+
+/* rxe_srq.c */
+#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT)
+
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                    struct ib_srq_attr *attr, enum ib_srq_attr_mask mask);
+
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_init_attr *init,
+                     struct ib_ucontext *context, struct ib_udata *udata);
+
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+                     struct ib_udata *udata);
+
+extern struct ib_dma_mapping_ops rxe_dma_mapping_ops;
+
+void rxe_release(struct kref *kref);
+
+int rxe_completer(void *arg);
+int rxe_requester(void *arg);
+int rxe_responder(void *arg);
+
+u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb);
+
+void rxe_resp_queue_pkt(struct rxe_dev *rxe,
+                       struct rxe_qp *qp, struct sk_buff *skb);
+
+void rxe_comp_queue_pkt(struct rxe_dev *rxe,
+                       struct rxe_qp *qp, struct sk_buff *skb);
+
+static inline unsigned wr_opcode_mask(int opcode, struct rxe_qp *qp)
+{
+       return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
+}
+
+static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
+                                 struct rxe_pkt_info *pkt, struct sk_buff *skb)
+{
+       int err;
+       int is_request = pkt->mask & RXE_REQ_MASK;
+
+       if ((is_request && (qp->req.state != QP_STATE_READY)) ||
+           (!is_request && (qp->resp.state != QP_STATE_READY))) {
+               pr_info("Packet dropped. QP is not in ready state\n");
+               goto drop;      /* skb is freed and 0 is returned */
+       }
+
+       if (pkt->mask & RXE_LOOPBACK_MASK) {
+               memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
+               err = rxe->ifc_ops->loopback(skb);
+       } else {
+               err = rxe->ifc_ops->send(rxe, pkt, skb);
+       }
+
+       if (err) {
+               rxe->xmit_errors++;
+               return err;     /* NOTE(review): skb is not freed here */
+       }
+
+       atomic_inc(&qp->skb_out);
+
+       if ((qp_type(qp) != IB_QPT_RC) &&
+           (pkt->mask & RXE_END_MASK)) {
+               pkt->wqe->state = wqe_state_done;
+               rxe_run_task(&qp->comp.task, 1);
+       }
+
+       goto done;
+
+drop:
+       kfree_skb(skb);
+       err = 0;        /* a dropped packet is not reported as an error */
+done:
+       return err;
+}
+
+#endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
new file mode 100644 (file)
index 0000000..fa95544
--- /dev/null
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+                     struct rxe_mc_grp **grp_p)
+{
+       int err;
+       struct rxe_mc_grp *grp;
+
+       if (rxe->attr.max_mcast_qp_attach == 0) {
+               err = -EINVAL;  /* device allows no mcast attach */
+               goto err1;
+       }
+
+       grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+       if (grp)
+               goto done;      /* group for mgid already exists */
+       /* NOTE(review): lookup and alloc are not under one lock here */
+       grp = rxe_alloc(&rxe->mc_grp_pool);
+       if (!grp) {
+               err = -ENOMEM;
+               goto err1;
+       }
+
+       INIT_LIST_HEAD(&grp->qp_list);
+       spin_lock_init(&grp->mcg_lock);
+       grp->rxe = rxe;
+
+       rxe_add_key(grp, mgid);
+
+       err = rxe->ifc_ops->mcast_add(rxe, mgid);
+       if (err)
+               goto err2;
+
+done:
+       *grp_p = grp;
+       return 0;
+       /* NOTE(review): cleanup may call mcast_delete after a failed add */
+err2:
+       rxe_drop_ref(grp);
+err1:
+       return err;
+}
+
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                          struct rxe_mc_grp *grp)
+{
+       int err;
+       struct rxe_mc_elem *elem;
+
+       /* check to see if the qp is already a member of the group */
+       spin_lock_bh(&qp->grp_lock);
+       spin_lock_bh(&grp->mcg_lock);   /* order: qp lock, then grp lock */
+       list_for_each_entry(elem, &grp->qp_list, qp_list) {
+               if (elem->qp == qp) {
+                       err = 0;        /* already attached */
+                       goto out;
+               }
+       }
+
+       if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) {
+               err = -ENOMEM;  /* group is at the attach limit */
+               goto out;
+       }
+
+       elem = rxe_alloc(&rxe->mc_elem_pool);
+       if (!elem) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* each qp holds a ref on the grp */
+       rxe_add_ref(grp);
+
+       grp->num_qp++;
+       elem->qp = qp;
+       elem->grp = grp;
+
+       list_add(&elem->qp_list, &grp->qp_list);        /* group side */
+       list_add(&elem->grp_list, &qp->grp_list);       /* qp side */
+
+       err = 0;
+out:
+       spin_unlock_bh(&grp->mcg_lock);
+       spin_unlock_bh(&qp->grp_lock);
+       return err;
+}
+
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                           union ib_gid *mgid)
+{
+       struct rxe_mc_grp *grp;
+       struct rxe_mc_elem *elem, *tmp;
+
+       grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+       if (!grp)
+               goto err1;      /* no group is keyed by mgid */
+
+       spin_lock_bh(&qp->grp_lock);
+       spin_lock_bh(&grp->mcg_lock);   /* order: qp lock, then grp lock */
+
+       list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) {
+               if (elem->qp == qp) {
+                       list_del(&elem->qp_list);
+                       list_del(&elem->grp_list);
+                       grp->num_qp--;
+
+                       spin_unlock_bh(&grp->mcg_lock);
+                       spin_unlock_bh(&qp->grp_lock);
+                       rxe_drop_ref(elem);
+                       rxe_drop_ref(grp);      /* ref held by QP */
+                       rxe_drop_ref(grp);      /* ref from get_key */
+                       return 0;
+               }
+       }
+
+       spin_unlock_bh(&grp->mcg_lock);
+       spin_unlock_bh(&qp->grp_lock);
+       rxe_drop_ref(grp);                      /* ref from get_key */
+err1:
+       return -EINVAL; /* unknown group, or qp is not attached to it */
+}
+
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp)
+{
+       struct rxe_mc_grp *grp;
+       struct rxe_mc_elem *elem;
+
+       while (1) {     /* detach one element per pass until none is left */
+               spin_lock_bh(&qp->grp_lock);
+               if (list_empty(&qp->grp_list)) {
+                       spin_unlock_bh(&qp->grp_lock);
+                       break;
+               }
+               elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem,
+                                       grp_list);
+               list_del(&elem->grp_list);      /* unlink from the qp side */
+               spin_unlock_bh(&qp->grp_lock);
+
+               grp = elem->grp;
+               spin_lock_bh(&grp->mcg_lock);   /* qp lock is not held here */
+               list_del(&elem->qp_list);       /* unlink from the group side */
+               grp->num_qp--;
+               spin_unlock_bh(&grp->mcg_lock);
+               rxe_drop_ref(grp);              /* ref held by QP */
+               rxe_drop_ref(elem);
+       }
+}
+
+void rxe_mc_cleanup(void *arg)
+{
+       struct rxe_mc_grp *grp = arg;
+       struct rxe_dev *rxe = grp->rxe;
+
+       rxe_drop_key(grp);
+       rxe->ifc_ops->mcast_delete(rxe, &grp->mgid);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c
new file mode 100644 (file)
index 0000000..54b3c7c
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+void rxe_mmap_release(struct kref *ref)
+{
+       struct rxe_mmap_info *ip = container_of(ref,
+                                       struct rxe_mmap_info, ref);
+       struct rxe_dev *rxe = to_rdev(ip->context->device);
+
+       spin_lock_bh(&rxe->pending_lock);
+
+       if (!list_empty(&ip->pending_mmaps))
+               list_del(&ip->pending_mmaps);
+
+       spin_unlock_bh(&rxe->pending_lock);
+
+       vfree(ip->obj);         /* buf */
+       kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the memory region is mapped,
+ * to avoid releasing it.
+ */
+static void rxe_vma_open(struct vm_area_struct *vma)
+{
+       struct rxe_mmap_info *ip = vma->vm_private_data;
+
+       kref_get(&ip->ref);
+}
+
+static void rxe_vma_close(struct vm_area_struct *vma)
+{
+       struct rxe_mmap_info *ip = vma->vm_private_data;
+
+       kref_put(&ip->ref, rxe_mmap_release);
+}
+
+static const struct vm_operations_struct rxe_vm_ops = {
+       .open = rxe_vma_open,           /* kref_get on the mmap info */
+       .close = rxe_vma_close,         /* kref_put, may release the info */
+};
+
+/**
+ * rxe_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return: 0, -EINVAL (no match or vma too large) or the remap error.
+ */
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       struct rxe_dev *rxe = to_rdev(context->device);
+       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+       unsigned long size = vma->vm_end - vma->vm_start;
+       struct rxe_mmap_info *ip, *pp;
+       int ret;
+
+       /*
+        * Search the device's list of objects waiting for a mmap call.
+        * Normally, this list is very short since a call to create a
+        * CQ, QP, or SRQ is soon followed by a call to mmap().
+        */
+       spin_lock_bh(&rxe->pending_lock);
+       list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) {
+               if (context != ip->context || (__u64)offset != ip->info.offset)
+                       continue;
+
+               /* Don't allow a mmap larger than the object. */
+               if (size > ip->info.size) {
+                       pr_err("mmap region is larger than the object!\n");
+                       spin_unlock_bh(&rxe->pending_lock);
+                       ret = -EINVAL;
+                       goto done;
+               }
+
+               goto found_it;
+       }
+       pr_warn("unable to find pending mmap info\n");
+       spin_unlock_bh(&rxe->pending_lock);
+       ret = -EINVAL;
+       goto done;
+
+found_it:
+       list_del_init(&ip->pending_mmaps);      /* no longer pending */
+       spin_unlock_bh(&rxe->pending_lock);
+
+       ret = remap_vmalloc_range(vma, ip->obj, 0);
+       if (ret) {
+               pr_err("rxe: err %d from remap_vmalloc_range\n", ret);
+               goto done;      /* NOTE(review): ip stays off the list */
+       }
+
+       vma->vm_ops = &rxe_vm_ops;
+       vma->vm_private_data = ip;
+       rxe_vma_open(vma);      /* the new mapping takes its own ref */
+done:
+       return ret;
+}
+
+/*
+ * Allocate an rxe_mmap_info for obj and assign it a page-aligned offset
+ */
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe,
+                                          u32 size,
+                                          struct ib_ucontext *context,
+                                          void *obj)
+{
+       struct rxe_mmap_info *ip;
+
+       ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+       if (!ip)
+               return NULL;
+
+       size = PAGE_ALIGN(size);
+
+       spin_lock_bh(&rxe->mmap_offset_lock);
+
+       if (rxe->mmap_offset == 0)
+               rxe->mmap_offset = PAGE_SIZE;   /* offset 0 is never used */
+
+       ip->info.offset = rxe->mmap_offset;
+       rxe->mmap_offset += size;       /* next object starts behind this */
+
+       spin_unlock_bh(&rxe->mmap_offset_lock);
+
+       INIT_LIST_HEAD(&ip->pending_mmaps);     /* not on any list yet */
+       ip->info.size = size;                   /* page-aligned size */
+       ip->context = context;
+       ip->obj = obj;
+       kref_init(&ip->ref);
+
+       return ip;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
new file mode 100644 (file)
index 0000000..f3dab65
--- /dev/null
@@ -0,0 +1,643 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/*
+ * lfsr (linear feedback shift register) with period 255
+ *
+ * Each call shifts the 8-bit state left by one and feeds the xor of
+ * four tap bits of the shifted value back into bit 0. The state is a
+ * function-local static that is updated without any locking.
+ */
+static u8 rxe_get_key(void)
+{
+       static unsigned int state = 1;
+       unsigned int shifted = state << 1;
+       unsigned int feedback;
+
+       /* taps are sampled from the already shifted value */
+       feedback = ((shifted >> 8) ^ (shifted >> 7) ^
+                   (shifted >> 6) ^ (shifted >> 4)) & 1;
+
+       state = (shifted | feedback) & 0xff;
+
+       return state;
+}
+
+/* Check that [iova, iova + length) lies inside the region described by mem.
+ *
+ * DMA type regions accept every range. MR and FMR regions must fully
+ * contain the requested range. Any other type is rejected.
+ *
+ * Returns 0 when the range is acceptable and -EFAULT otherwise.
+ */
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length)
+{
+       switch (mem->type) {
+       case RXE_MEM_TYPE_DMA:
+               return 0;
+
+       case RXE_MEM_TYPE_MR:
+       case RXE_MEM_TYPE_FMR:
+               /* Compare without forming iova + length: that sum can wrap
+                * around for a large iova/length pair and would then slip
+                * past a plain "end > region end" test.
+                */
+               if (iova < mem->iova ||
+                   length > mem->length ||
+                   iova > mem->iova + mem->length - length)
+                       return -EFAULT;
+               return 0;
+
+       default:
+               return -EFAULT;
+       }
+}
+
+#define IB_ACCESS_REMOTE       (IB_ACCESS_REMOTE_READ          \
+                               | IB_ACCESS_REMOTE_WRITE        \
+                               | IB_ACCESS_REMOTE_ATOMIC)
+
+/* Give a mem object fresh keys and put it into its initial state.
+ *
+ * The lkey is the pool index shifted left by 8 with a value from
+ * rxe_get_key() in the low byte. The rkey equals the lkey when any
+ * remote access bit is requested and is 0 otherwise. Objects that come
+ * from an RXE_TYPE_MR pool get the same keys mirrored into mem->ibmr.
+ */
+static void rxe_mem_init(int access, struct rxe_mem *mem)
+{
+       u32 local_key = mem->pelem.index << 8 | rxe_get_key();
+       u32 remote_key = 0;
+
+       if (access & IB_ACCESS_REMOTE)
+               remote_key = local_key;
+
+       mem->lkey = local_key;
+       mem->rkey = remote_key;
+
+       if (mem->pelem.pool->type == RXE_TYPE_MR) {
+               mem->ibmr.lkey = local_key;
+               mem->ibmr.rkey = remote_key;
+       }
+
+       mem->map_shift = ilog2(RXE_BUF_PER_MAP);
+       mem->type = RXE_MEM_TYPE_NONE;
+       mem->state = RXE_MEM_STATE_INVALID;
+}
+
+/* Release what a mem object owns: the umem, if one is attached, and the
+ * map table together with every map it points to.
+ *
+ * arg is the struct rxe_mem to clean up, passed as void *.
+ */
+void rxe_mem_cleanup(void *arg)
+{
+       struct rxe_mem *mem = arg;
+       int idx;
+
+       if (mem->umem)
+               ib_umem_release(mem->umem);
+
+       if (!mem->map)
+               return;
+
+       for (idx = 0; idx < mem->num_map; idx++)
+               kfree(mem->map[idx]);
+
+       kfree(mem->map);
+}
+
+/* Allocate the two level map table of a mem object.
+ *
+ * Enough maps are allocated to hold num_buf buffers, RXE_BUF_PER_MAP
+ * buffers per map. On success mem->map, map_shift, map_mask, num_buf,
+ * num_map and max_buf are set up.
+ *
+ * Returns 0 on success or -ENOMEM. On failure everything allocated here
+ * is freed again and mem->map is left NULL.
+ */
+static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf)
+{
+       int i;
+       int num_map;
+
+       num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
+
+       mem->map = kmalloc_array(num_map, sizeof(*mem->map), GFP_KERNEL);
+       if (!mem->map)
+               goto err1;
+
+       for (i = 0; i < num_map; i++) {
+               mem->map[i] = kmalloc(sizeof(**mem->map), GFP_KERNEL);
+               if (!mem->map[i])
+                       goto err2;
+       }
+
+       WARN_ON(!is_power_of_2(RXE_BUF_PER_MAP));
+
+       mem->map_shift  = ilog2(RXE_BUF_PER_MAP);
+       mem->map_mask   = RXE_BUF_PER_MAP - 1;
+
+       mem->num_buf = num_buf;
+       mem->num_map = num_map;
+       mem->max_buf = num_map * RXE_BUF_PER_MAP;
+
+       return 0;
+
+err2:
+       /* free only the maps that were actually allocated */
+       for (i--; i >= 0; i--)
+               kfree(mem->map[i]);
+
+       kfree(mem->map);
+       /* rxe_mem_cleanup() frees any non-NULL mem->map, so a stale pointer
+        * left here would be freed a second time
+        */
+       mem->map = NULL;
+err1:
+       return -ENOMEM;
+}
+
+/* Set up mem as a DMA type region owned by pd.
+ *
+ * No map table is allocated; the object is marked valid right away with
+ * the requested access rights. Always returns 0.
+ */
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+                    int access, struct rxe_mem *mem)
+{
+       rxe_mem_init(access, mem);
+
+       mem->type = RXE_MEM_TYPE_DMA;
+       mem->state = RXE_MEM_STATE_VALID;
+       mem->access = access;
+       mem->pd = pd;
+
+       return 0;
+}
+
+/* Set up mem as an MR backed by user memory.
+ *
+ * The user range [start, start + length) is obtained with ib_umem_get(),
+ * a map table with one buffer per scatterlist entry is allocated, and
+ * each buffer records the kernel address of the entry's page and the
+ * umem page size.
+ *
+ * Returns 0 on success. Returns the error from ib_umem_get() or
+ * rxe_mem_alloc() when one of them fails, or -ENOMEM when a page has no
+ * kernel address.
+ */
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+                     u64 length, u64 iova, int access, struct ib_udata *udata,
+                     struct rxe_mem *mem)
+{
+       int                     entry;
+       struct rxe_map          **map;
+       struct rxe_phys_buf     *buf = NULL;
+       struct ib_umem          *umem;
+       struct scatterlist      *sg;
+       int                     num_buf;
+       void                    *vaddr;
+       int err;
+
+       umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0);
+       if (IS_ERR(umem)) {
+               /* report the real reason instead of a blanket -EINVAL */
+               err = PTR_ERR(umem);
+               pr_warn("err %d from rxe_umem_get\n", err);
+               goto err1;
+       }
+
+       mem->umem = umem;
+       num_buf = umem->nmap;
+
+       rxe_mem_init(access, mem);
+
+       err = rxe_mem_alloc(rxe, mem, num_buf);
+       if (err) {
+               pr_warn("err %d from rxe_mem_alloc\n", err);
+               ib_umem_release(umem);
+               /* rxe_mem_cleanup() releases any non-NULL mem->umem; clear
+                * it so the umem cannot be released a second time
+                */
+               mem->umem = NULL;
+               goto err1;
+       }
+
+       WARN_ON(!is_power_of_2(umem->page_size));
+
+       mem->page_shift         = ilog2(umem->page_size);
+       mem->page_mask          = umem->page_size - 1;
+
+       num_buf                 = 0;
+       map                     = mem->map;
+       if (length > 0) {
+               buf = map[0]->buf;
+
+               for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+                       /* Move to the next map only when another buffer is
+                        * about to be stored, so mem->map[] is never read
+                        * past its last entry once the final map is full.
+                        */
+                       if (num_buf >= RXE_BUF_PER_MAP) {
+                               map++;
+                               buf = map[0]->buf;
+                               num_buf = 0;
+                       }
+
+                       vaddr = page_address(sg_page(sg));
+                       if (!vaddr) {
+                               pr_warn("null vaddr\n");
+                               err = -ENOMEM;
+                               /* mem->umem and mem->map stay attached to
+                                * mem and are freed by rxe_mem_cleanup().
+                                * NOTE(review): confirm the caller drops mem
+                                * when this function fails.
+                                */
+                               goto err1;
+                       }
+
+                       buf->addr = (uintptr_t)vaddr;
+                       buf->size = umem->page_size;
+                       num_buf++;
+                       buf++;
+               }
+       }
+
+       mem->pd                 = pd;
+       mem->umem               = umem;
+       mem->access             = access;
+       mem->length             = length;
+       mem->iova               = iova;
+       mem->va                 = start;
+       mem->offset             = ib_umem_offset(umem);
+       mem->state              = RXE_MEM_STATE_VALID;
+       mem->type               = RXE_MEM_TYPE_MR;
+
+       return 0;
+
+err1:
+       return err;
+}
+
+/* Set up mem as an MR with room for max_pages buffers whose pages are
+ * supplied later.
+ *
+ * The object is initialised without remote access bits, the ibmr rkey is
+ * then made equal to the ibmr lkey, and the map table is allocated. On
+ * success the object is left in RXE_MEM_STATE_FREE.
+ *
+ * Returns 0 on success or the error from rxe_mem_alloc().
+ */
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+                     int max_pages, struct rxe_mem *mem)
+{
+       int ret;
+
+       rxe_mem_init(0, mem);
+
+       /* In fastreg, we also set the rkey */
+       mem->ibmr.rkey = mem->ibmr.lkey;
+
+       ret = rxe_mem_alloc(rxe, mem, max_pages);
+       if (ret)
+               return ret;
+
+       /* overrides the rounded-up max_buf chosen by rxe_mem_alloc() */
+       mem->max_buf = max_pages;
+       mem->type = RXE_MEM_TYPE_MR;
+       mem->state = RXE_MEM_STATE_FREE;
+       mem->pd = pd;
+
+       return 0;
+}
+
+/* Translate an iova within mem into a position in the map table.
+ *
+ * On return *m_out is the index into mem->map, *n_out is the buffer
+ * index inside that map and *offset_out is the byte offset inside that
+ * buffer. No range checking is done here.
+ */
+static void lookup_iova(
+       struct rxe_mem  *mem,
+       u64                     iova,
+       int                     *m_out,
+       int                     *n_out,
+       size_t                  *offset_out)
+{
+       /* byte offset from the start of the first buffer */
+       size_t                  offset = iova - mem->iova + mem->offset;
+       int                     map_index;
+       int                     buf_index;
+       u64                     length;
+
+       if (likely(mem->page_shift)) {
+               /* non-zero page_shift: indices are derived with masks and
+                * shifts only, no buffer sizes are read
+                */
+               *offset_out = offset & mem->page_mask;
+               offset >>= mem->page_shift;
+               *n_out = offset & mem->map_mask;
+               *m_out = offset >> mem->map_shift;
+       } else {
+               /* page_shift == 0: walk the buffers one by one, subtracting
+                * each recorded size until the offset falls inside one
+                */
+               map_index = 0;
+               buf_index = 0;
+
+               length = mem->map[map_index]->buf[buf_index].size;
+
+               while (offset >= length) {
+                       offset -= length;
+                       buf_index++;
+
+                       if (buf_index == RXE_BUF_PER_MAP) {
+                               map_index++;
+                               buf_index = 0;
+                       }
+                       length = mem->map[map_index]->buf[buf_index].size;
+               }
+
+               *m_out = map_index;
+               *n_out = buf_index;
+               *offset_out = offset;
+       }
+}
+
+/* Return the kernel address that corresponds to iova inside mem.
+ *
+ * The object must be in the valid state. Without a map table the iova
+ * itself is returned as the address. Otherwise the range must pass
+ * mem_check_range() and must not extend past the end of the buffer that
+ * contains iova.
+ *
+ * Returns the address, or NULL after logging a warning.
+ */
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length)
+{
+       struct rxe_phys_buf *pbuf;
+       size_t off;
+       int map_idx, buf_idx;
+
+       if (mem->state != RXE_MEM_STATE_VALID) {
+               pr_warn("mem not in valid state\n");
+               return NULL;
+       }
+
+       if (!mem->map)
+               return (void *)(uintptr_t)iova;
+
+       if (mem_check_range(mem, iova, length)) {
+               pr_warn("range violation\n");
+               return NULL;
+       }
+
+       lookup_iova(mem, iova, &map_idx, &buf_idx, &off);
+       pbuf = &mem->map[map_idx]->buf[buf_idx];
+
+       if (off + length > pbuf->size) {
+               pr_warn("crosses page boundary\n");
+               return NULL;
+       }
+
+       return (void *)(uintptr_t)pbuf->addr + off;
+}
+
+/* Copy length bytes between the kernel buffer addr and the mem object,
+ * starting at iova inside the mem object.
+ *
+ * dir selects the direction: to_mem_obj copies from addr into the mem
+ * object, any other value copies from the mem object to addr.
+ *
+ * If crcp is not NULL, *crcp is updated with crc32_le() over the bytes
+ * that were copied, using the value found in *crcp as the seed.
+ *
+ * The caller must hold a reference to mem.
+ *
+ * Returns 0 on success or -EFAULT when the range fails mem_check_range().
+ */
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
+                enum copy_direction dir, u32 *crcp)
+{
+       int                     err;
+       int                     bytes;
+       u8                      *va;
+       struct rxe_map          **map;
+       struct rxe_phys_buf     *buf;
+       int                     m;
+       int                     i;
+       size_t                  offset;
+       u32                     crc = crcp ? (*crcp) : 0;
+
+       if (mem->type == RXE_MEM_TYPE_DMA) {
+               /* DMA regions have no map table: iova is used directly as
+                * the address and no range check is made
+                */
+               u8 *src, *dest;
+
+               src  = (dir == to_mem_obj) ?
+                       addr : ((void *)(uintptr_t)iova);
+
+               dest = (dir == to_mem_obj) ?
+                       ((void *)(uintptr_t)iova) : addr;
+
+               if (crcp)
+                       *crcp = crc32_le(*crcp, src, length);
+
+               memcpy(dest, src, length);
+
+               return 0;
+       }
+
+       WARN_ON(!mem->map);
+
+       err = mem_check_range(mem, iova, length);
+       if (err) {
+               err = -EFAULT;
+               goto err1;
+       }
+
+       /* find the map, buffer and in-buffer offset where iova starts */
+       lookup_iova(mem, iova, &m, &i, &offset);
+
+       map     = mem->map + m;
+       buf     = map[0]->buf + i;
+
+       while (length > 0) {
+               u8 *src, *dest;
+
+               va      = (u8 *)(uintptr_t)buf->addr + offset;
+               src  = (dir == to_mem_obj) ? addr : va;
+               dest = (dir == to_mem_obj) ? va : addr;
+
+               /* copy at most what is left in the current buffer */
+               bytes   = buf->size - offset;
+
+               if (bytes > length)
+                       bytes = length;
+
+               if (crcp)
+                       crc = crc32_le(crc, src, bytes);
+
+               memcpy(dest, src, bytes);
+
+               length  -= bytes;
+               addr    += bytes;
+
+               /* every buffer after the first is used from its start */
+               offset  = 0;
+               buf++;
+               i++;
+
+               /* NOTE(review): this advance also runs after the final
+                * chunk, so map[0] may be read one slot past the last map
+                * when the copy ends exactly at a map boundary - confirm.
+                */
+               if (i == RXE_BUF_PER_MAP) {
+                       i = 0;
+                       map++;
+                       buf = map[0]->buf;
+               }
+       }
+
+       if (crcp)
+               *crcp = crc;
+
+       return 0;
+
+err1:
+       return err;
+}
+
+/* copy data in or out of a wqe, i.e. sg list
+ * under the control of a dma descriptor
+ *
+ * Copying starts at the position recorded in dma (cur_sge, sge_offset)
+ * and moves length bytes between addr and the memory described by the
+ * sge list. For every sge with a non-zero length the mem object is
+ * looked up by sge->lkey as a local key and the bytes are moved with
+ * rxe_mem_copy(), which also updates *crcp when crcp is not NULL.
+ *
+ * On success dma->sge_offset and dma->resid are updated and 0 is
+ * returned. dma->cur_sge is advanced while walking, including on the
+ * failure paths.
+ *
+ * Errors: -EINVAL when length exceeds dma->resid or a lookup fails,
+ * -ENOSPC when the sge list is exhausted, or the error returned by
+ * rxe_mem_copy().
+ */
+int copy_data(
+       struct rxe_dev          *rxe,
+       struct rxe_pd           *pd,
+       int                     access,
+       struct rxe_dma_info     *dma,
+       void                    *addr,
+       int                     length,
+       enum copy_direction     dir,
+       u32                     *crcp)
+{
+       int                     bytes;
+       struct rxe_sge          *sge    = &dma->sge[dma->cur_sge];
+       int                     offset  = dma->sge_offset;
+       int                     resid   = dma->resid;
+       struct rxe_mem          *mem    = NULL;
+       u64                     iova;
+       int                     err;
+
+       if (length == 0)
+               return 0;
+
+       if (length > resid) {
+               err = -EINVAL;
+               goto err2;
+       }
+
+       /* take a reference on the current sge's mem only if some of that
+        * sge is still unused
+        */
+       if (sge->length && (offset < sge->length)) {
+               mem = lookup_mem(pd, access, sge->lkey, lookup_local);
+               if (!mem) {
+                       err = -EINVAL;
+                       goto err1;
+               }
+       }
+
+       while (length > 0) {
+               bytes = length;
+
+               if (offset >= sge->length) {
+                       /* current sge is used up: drop its mem reference
+                        * and move on to the next sge
+                        */
+                       if (mem) {
+                               rxe_drop_ref(mem);
+                               mem = NULL;
+                       }
+                       sge++;
+                       dma->cur_sge++;
+                       offset = 0;
+
+                       if (dma->cur_sge >= dma->num_sge) {
+                               err = -ENOSPC;
+                               goto err2;
+                       }
+
+                       if (sge->length) {
+                               mem = lookup_mem(pd, access, sge->lkey,
+                                                lookup_local);
+                               if (!mem) {
+                                       err = -EINVAL;
+                                       goto err1;
+                               }
+                       } else {
+                               /* zero length sge: skip it */
+                               continue;
+                       }
+               }
+
+               /* never copy past the end of the current sge */
+               if (bytes > sge->length - offset)
+                       bytes = sge->length - offset;
+
+               if (bytes > 0) {
+                       iova = sge->addr + offset;
+
+                       err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp);
+                       if (err)
+                               goto err2;
+
+                       offset  += bytes;
+                       resid   -= bytes;
+                       length  -= bytes;
+                       addr    += bytes;
+               }
+       }
+
+       dma->sge_offset = offset;
+       dma->resid      = resid;
+
+       if (mem)
+               rxe_drop_ref(mem);
+
+       return 0;
+
+err2:
+       /* err2 is reached with or without a held reference */
+       if (mem)
+               rxe_drop_ref(mem);
+err1:
+       return err;
+}
+
+/* Move the position recorded in dma forward by length bytes without
+ * copying any data.
+ *
+ * dma->cur_sge is advanced while walking. dma->sge_offset and dma->resid
+ * are written back only when the whole length was consumed.
+ *
+ * Returns 0 on success or -ENOSPC when the sge list runs out first.
+ */
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
+{
+       struct rxe_sge *sge = &dma->sge[dma->cur_sge];
+       int sge_off = dma->sge_offset;
+       int remaining = dma->resid;
+       unsigned int chunk;
+
+       for (; length; length -= chunk) {
+               if (sge_off >= sge->length) {
+                       /* current sge is used up, step to the next one */
+                       sge++;
+                       dma->cur_sge++;
+                       sge_off = 0;
+                       if (dma->cur_sge >= dma->num_sge)
+                               return -ENOSPC;
+               }
+
+               chunk = length;
+               if (chunk > sge->length - sge_off)
+                       chunk = sge->length - sge_off;
+
+               sge_off += chunk;
+               remaining -= chunk;
+       }
+
+       dma->sge_offset = sge_off;
+       dma->resid = remaining;
+
+       return 0;
+}
+
+/* Look up the mem object named by key and validate it.
+ *
+ * (1) the pool index is taken from key >> 8 and must lie within
+ *     RXE_MIN_MR_INDEX..RXE_MAX_MR_INDEX
+ * (2) the full key must equal the object's lkey or rkey, depending on
+ *     the lookup type
+ * (3) the object must belong to pd
+ * (4) if access is non-zero, at least one requested access bit must be
+ *     granted by the object
+ * (5) the object must be in the valid state
+ *
+ * Returns the object with the reference taken by the pool lookup, or
+ * NULL (with that reference dropped again) when any check fails.
+ */
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+                          enum lookup_type type)
+{
+       struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
+       int index = key >> 8;
+       struct rxe_mem *mem;
+
+       if (index < RXE_MIN_MR_INDEX || index > RXE_MAX_MR_INDEX)
+               return NULL;
+
+       mem = rxe_pool_get_index(&rxe->mr_pool, index);
+       if (!mem)
+               return NULL;
+
+       if ((type == lookup_local && mem->lkey != key) ||
+           (type == lookup_remote && mem->rkey != key) ||
+           mem->pd != pd ||
+           (access && !(access & mem->access)) ||
+           mem->state != RXE_MEM_STATE_VALID) {
+               rxe_drop_ref(mem);
+               return NULL;
+       }
+
+       return mem;
+}
+
+/* Load a list of page addresses into the map table of mem.
+ *
+ * page points to num_pages addresses. Each one is stored in the next
+ * buffer slot with a size of 1 << mem->page_shift. Afterwards iova, va
+ * and length are set and the object is marked valid.
+ *
+ * Returns 0 on success or -EINVAL when num_pages exceeds mem->max_buf.
+ */
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+                     u64 *page, int num_pages, u64 iova)
+{
+       int i;
+       int num_buf;
+       struct rxe_map **map;
+       struct rxe_phys_buf *buf;
+       int page_size;
+
+       if (num_pages > mem->max_buf)
+               return -EINVAL;
+
+       num_buf         = 0;
+       page_size       = 1 << mem->page_shift;
+       map             = mem->map;
+       buf             = map[0]->buf;
+
+       for (i = 0; i < num_pages; i++) {
+               /* Switch to the next map only when another page is about to
+                * be stored, so mem->map[] is never read past its last
+                * entry when the pages exactly fill the final map.
+                */
+               if (num_buf == RXE_BUF_PER_MAP) {
+                       map++;
+                       buf = map[0]->buf;
+                       num_buf = 0;
+               }
+
+               buf->addr = *page++;
+               buf->size = page_size;
+               buf++;
+               num_buf++;
+       }
+
+       mem->iova       = iova;
+       mem->va         = iova;
+       mem->length     = num_pages << mem->page_shift;
+       mem->state      = RXE_MEM_STATE_VALID;
+
+       return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
new file mode 100644 (file)
index 0000000..0b8d2ea
--- /dev/null
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <net/udp_tunnel.h>
+#include <net/sch_generic.h>
+#include <linux/netfilter.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe.h"
+#include "rxe_net.h"
+#include "rxe_loc.h"
+
+/* all rxe devices, linked through rxe_dev.list */
+static LIST_HEAD(rxe_dev_list);
+/* Spinlock for device list. Statically initialised so that the lookups
+ * in this file never take a lock that has not been set up; this stays
+ * correct even if a spin_lock_init() call exists elsewhere.
+ */
+static DEFINE_SPINLOCK(dev_list_lock);
+
+/* Find the rxe device that sits on top of the given net_device.
+ *
+ * The device list is searched under dev_list_lock. Returns the matching
+ * rxe device or NULL; no reference is taken on the result.
+ */
+struct rxe_dev *net_to_rxe(struct net_device *ndev)
+{
+       struct rxe_dev *cur;
+       struct rxe_dev *match = NULL;
+
+       spin_lock_bh(&dev_list_lock);
+       list_for_each_entry(cur, &rxe_dev_list, list) {
+               if (cur->ndev != ndev)
+                       continue;
+
+               match = cur;
+               break;
+       }
+       spin_unlock_bh(&dev_list_lock);
+
+       return match;
+}
+
+/* Find the rxe device whose ib_dev.name equals name.
+ *
+ * The device list is searched under dev_list_lock. Returns the matching
+ * rxe device or NULL; no reference is taken on the result.
+ */
+struct rxe_dev *get_rxe_by_name(const char *name)
+{
+       struct rxe_dev *cur;
+       struct rxe_dev *match = NULL;
+
+       spin_lock_bh(&dev_list_lock);
+       list_for_each_entry(cur, &rxe_dev_list, list) {
+               if (strcmp(name, cur->ib_dev.name) != 0)
+                       continue;
+
+               match = cur;
+               break;
+       }
+       spin_unlock_bh(&dev_list_lock);
+
+       return match;
+}
+
+
+/* Module-wide socket holder. Its sk6 member supplies the socket and the
+ * network namespace for the IPv6 route lookup in rxe_find_route6().
+ */
+struct rxe_recv_sockets recv_sockets;
+
+/* Build a 64-bit identifier from the 6-byte hardware address of ndev.
+ *
+ * The first three address bytes are followed by 0xff 0xfe and then the
+ * last three address bytes; bit value 2 of the first byte is inverted.
+ */
+static __be64 rxe_mac_to_eui64(struct net_device *ndev)
+{
+       const unsigned char *mac = ndev->dev_addr;
+       __be64 eui64;
+       unsigned char *out = (unsigned char *)&eui64;
+
+       memcpy(out, mac, 3);
+       out[0] ^= 2;
+
+       out[3] = 0xff;
+       out[4] = 0xfe;
+
+       memcpy(out + 5, mac + 3, 3);
+
+       return eui64;
+}
+
+/* Node GUID: the value rxe_mac_to_eui64() derives from rxe->ndev. */
+static __be64 node_guid(struct rxe_dev *rxe)
+{
+       return rxe_mac_to_eui64(rxe->ndev);
+}
+
+/* Port GUID: computed exactly like the node GUID, from rxe->ndev. */
+static __be64 port_guid(struct rxe_dev *rxe)
+{
+       return rxe_mac_to_eui64(rxe->ndev);
+}
+
+/* Return the parent device of the net_device behind rxe. For an 802.1Q
+ * VLAN device the underlying real device is used instead.
+ */
+static struct device *dma_device(struct rxe_dev *rxe)
+{
+       struct net_device *ndev = rxe->ndev;
+
+       if (ndev->priv_flags & IFF_802_1Q_VLAN)
+               ndev = vlan_dev_real_dev(ndev);
+
+       return ndev->dev.parent;
+}
+
+/* Map the multicast gid to an Ethernet multicast address and add that
+ * address to the net_device behind rxe. Returns the dev_mc_add() result.
+ */
+static int mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+       unsigned char ll_addr[ETH_ALEN];
+
+       ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+       return dev_mc_add(rxe->ndev, ll_addr);
+}
+
+/* Map the multicast gid to an Ethernet multicast address and remove that
+ * address from the net_device behind rxe. Returns the dev_mc_del() result.
+ */
+static int mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+       unsigned char ll_addr[ETH_ALEN];
+
+       ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+       return dev_mc_del(rxe->ndev, ll_addr);
+}
+
+/* Look up an IPv4 UDP route from saddr to daddr that leaves through
+ * ndev, in the initial network namespace.
+ *
+ * Returns the dst entry of the route, or NULL after a rate limited
+ * error message when no route is found.
+ */
+static struct dst_entry *rxe_find_route4(struct net_device *ndev,
+                                 struct in_addr *saddr,
+                                 struct in_addr *daddr)
+{
+       struct flowi4 fl = { { 0 } };
+       struct rtable *rt;
+
+       memset(&fl, 0, sizeof(fl));
+       fl.flowi4_proto = IPPROTO_UDP;
+       fl.flowi4_oif = ndev->ifindex;
+       memcpy(&fl.daddr, daddr, sizeof(*daddr));
+       memcpy(&fl.saddr, saddr, sizeof(*saddr));
+
+       rt = ip_route_output_key(&init_net, &fl);
+       if (!IS_ERR(rt))
+               return &rt->dst;
+
+       pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr);
+       return NULL;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* Look up an IPv6 UDP route from saddr to daddr that leaves through
+ * ndev, using the socket and network namespace of recv_sockets.sk6.
+ *
+ * Returns the dst entry of the route, or NULL after logging an error
+ * when the lookup fails or the returned entry carries an error.
+ */
+static struct dst_entry *rxe_find_route6(struct net_device *ndev,
+                                        struct in6_addr *saddr,
+                                        struct in6_addr *daddr)
+{
+       /* Start from NULL: the lookup may fail without storing anything in
+        * ndst, and the error path below hands ndst to dst_release().
+        */
+       struct dst_entry *ndst = NULL;
+       struct flowi6 fl6 = { { 0 } };
+
+       memset(&fl6, 0, sizeof(fl6));
+       fl6.flowi6_oif = ndev->ifindex;
+       memcpy(&fl6.saddr, saddr, sizeof(*saddr));
+       memcpy(&fl6.daddr, daddr, sizeof(*daddr));
+       fl6.flowi6_proto = IPPROTO_UDP;
+
+       if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk),
+                                               recv_sockets.sk6->sk, &ndst, &fl6))) {
+               pr_err_ratelimited("no route to %pI6\n", daddr);
+               goto put;
+       }
+
+       if (unlikely(ndst->error)) {
+               pr_err("no route to %pI6\n", daddr);
+               goto put;
+       }
+
+       return ndst;
+put:
+       dst_release(ndst);
+       return NULL;
+}
+
+#else
+
+/* Variant built when CONFIG_IPV6 is not enabled: never finds a route. */
+static struct dst_entry *rxe_find_route6(struct net_device *ndev,
+                                        struct in6_addr *saddr,
+                                        struct in6_addr *daddr)
+{
+       return NULL;
+}
+
+#endif
+
+/* encap_rcv callback installed on the tunnel socket by
+ * rxe_setup_udp_tunnel().
+ *
+ * The packet is dropped when its net_device has no rxe device or when it
+ * cannot be linearized. Otherwise the packet info stored with the skb is
+ * filled in from the UDP header and the skb is passed to rxe_rcv().
+ *
+ * Returns the rxe_rcv() result, or 0 after dropping the packet.
+ */
+static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+       struct rxe_dev *rxe = net_to_rxe(skb->dev);
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct udphdr *udph;
+
+       if (!rxe)
+               goto drop;
+
+       if (skb_linearize(skb)) {
+               pr_err("skb_linearize failed\n");
+               goto drop;
+       }
+
+       udph = udp_hdr(skb);
+
+       /* the payload starts right behind the UDP header */
+       pkt->hdr = (u8 *)(udph + 1);
+       pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph);
+       pkt->mask = RXE_GRH_MASK;
+       pkt->port_num = 1;
+       pkt->rxe = rxe;
+
+       return rxe_rcv(skb);
+
+drop:
+       kfree_skb(skb);
+       return 0;
+}
+
+/* Create a kernel UDP socket bound to port and register
+ * rxe_udp_encap_recv() as its encapsulation receive handler.
+ *
+ * ipv6 selects an AF_INET6 socket restricted to IPv6 only; otherwise an
+ * AF_INET socket is created.
+ *
+ * Returns the socket, or an ERR_PTR() when socket creation fails.
+ */
+static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
+                                          bool ipv6)
+{
+       int err;
+       struct socket *sock;
+       struct udp_port_cfg udp_cfg;
+       struct udp_tunnel_sock_cfg tnl_cfg;
+
+       memset(&udp_cfg, 0, sizeof(udp_cfg));
+
+       if (ipv6) {
+               udp_cfg.family = AF_INET6;
+               udp_cfg.ipv6_v6only = 1;
+       } else {
+               udp_cfg.family = AF_INET;
+       }
+
+       udp_cfg.local_udp_port = port;
+
+       /* Create UDP socket */
+       err = udp_sock_create(net, &udp_cfg, &sock);
+       if (err < 0) {
+               pr_err("failed to create udp socket. err = %d\n", err);
+               return ERR_PTR(err);
+       }
+
+       /* Zero the whole config first: only four members are assigned
+        * below, and any other member of the struct would otherwise reach
+        * setup_udp_tunnel_sock() holding stack garbage.
+        */
+       memset(&tnl_cfg, 0, sizeof(tnl_cfg));
+       tnl_cfg.sk_user_data = NULL;
+       tnl_cfg.encap_type = 1;
+       tnl_cfg.encap_rcv = rxe_udp_encap_recv;
+       tnl_cfg.encap_destroy = NULL;
+
+       /* Setup UDP tunnel */
+       setup_udp_tunnel_sock(net, sock, &tnl_cfg);
+
+       return sock;
+}
+
+/* Release a tunnel socket via udp_tunnel_sock_release(). Note that sk is
+ * a struct socket here, not a struct sock.
+ */
+static void rxe_release_udp_tunnel(struct socket *sk)
+{
+       udp_tunnel_sock_release(sk);
+}
+
+/* Push a UDP header in front of the current skb data and fill it in.
+ *
+ * The length field covers the header plus everything already in the skb;
+ * the checksum field is left at 0.
+ */
+static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
+                           __be16 dst_port)
+{
+       struct udphdr *uh;
+
+       __skb_push(skb, sizeof(*uh));
+       skb_reset_transport_header(skb);
+
+       uh = udp_hdr(skb);
+       uh->source = src_port;
+       uh->dest = dst_port;
+       uh->check = 0;
+       uh->len = htons(skb->len);
+}
+
+/* Attach dst to the skb and push a complete IPv4 header (no options) in
+ * front of the current skb data.
+ *
+ * The skb is scrubbed first, its hash is cleared and its IP control
+ * block is zeroed. The header id, total length and checksum are filled
+ * in last.
+ */
+static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb,
+                            __be32 saddr, __be32 daddr, __u8 proto,
+                            __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+       struct iphdr *hdr;
+
+       skb_scrub_packet(skb, xnet);
+
+       skb_clear_hash(skb);
+       skb_dst_set(skb, dst);
+       memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+       skb_push(skb, sizeof(*hdr));
+       skb_reset_network_header(skb);
+       hdr = ip_hdr(skb);
+
+       hdr->version = IPVERSION;
+       hdr->ihl = sizeof(*hdr) >> 2;
+       hdr->tos = tos;
+       hdr->ttl = ttl;
+       hdr->protocol = proto;
+       hdr->frag_off = df;
+       hdr->saddr = saddr;
+       hdr->daddr = daddr;
+
+       /* id, length and checksum go last, once the other fields are set */
+       __ip_select_ident(dev_net(dst->dev), hdr,
+                         skb_shinfo(skb)->gso_segs ?: 1);
+       hdr->tot_len = htons(skb->len);
+       ip_send_check(hdr);
+}
+
+/* Attach dst to the skb and push an IPv6 header in front of the current
+ * skb data.
+ *
+ * prio is passed to ip6_flow_hdr() as the traffic class with a zero flow
+ * label, proto becomes the next header value and ttl the hop limit.
+ */
+static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
+                            struct in6_addr *saddr, struct in6_addr *daddr,
+                            __u8 proto, __u8 prio, __u8 ttl)
+{
+       struct ipv6hdr *ip6h;
+
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+       IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
+                           | IPSKB_REROUTED);
+       skb_dst_set(skb, dst);
+
+       __skb_push(skb, sizeof(*ip6h));
+       skb_reset_network_header(skb);
+       ip6h              = ipv6_hdr(skb);
+       ip6_flow_hdr(ip6h, prio, htonl(0));
+       ip6h->nexthdr     = proto;
+       ip6h->hop_limit   = ttl;
+       ip6h->daddr       = *daddr;
+       ip6h->saddr       = *saddr;
+       /* The payload length excludes the IPv6 header pushed above. It is
+        * stored exactly once; an earlier store of the full skb->len was
+        * dead because this one overwrote it.
+        */
+       ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
+}
+
+/* Add UDP and IPv4 headers to skb for the addresses found in av.
+ *
+ * The packet is flagged with RXE_LOOPBACK_MASK when source and
+ * destination address are equal. The IPv4 header is built with the
+ * don't-fragment flag set.
+ *
+ * Returns 0, or -EHOSTUNREACH when no route is found.
+ */
+static int prepare4(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av)
+{
+       struct in_addr *src = &av->sgid_addr._sockaddr_in.sin_addr;
+       struct in_addr *dest = &av->dgid_addr._sockaddr_in.sin_addr;
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct dst_entry *route;
+
+       route = rxe_find_route4(rxe->ndev, src, dest);
+       if (!route) {
+               pr_err("Host not reachable\n");
+               return -EHOSTUNREACH;
+       }
+
+       if (memcmp(src, dest, sizeof(*dest)) == 0)
+               pkt->mask |= RXE_LOOPBACK_MASK;
+
+       prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
+                       htons(ROCE_V2_UDP_DPORT));
+
+       prepare_ipv4_hdr(route, skb, src->s_addr, dest->s_addr, IPPROTO_UDP,
+                        av->grh.traffic_class, av->grh.hop_limit,
+                        htons(IP_DF), false);
+       return 0;
+}
+
+/* Add UDP and IPv6 headers to skb for the addresses found in av.
+ *
+ * The packet is flagged with RXE_LOOPBACK_MASK when source and
+ * destination address are equal.
+ *
+ * Returns 0, or -EHOSTUNREACH when no route is found.
+ */
+static int prepare6(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av)
+{
+       struct in6_addr *src = &av->sgid_addr._sockaddr_in6.sin6_addr;
+       struct in6_addr *dest = &av->dgid_addr._sockaddr_in6.sin6_addr;
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct dst_entry *route;
+
+       route = rxe_find_route6(rxe->ndev, src, dest);
+       if (!route) {
+               pr_err("Host not reachable\n");
+               return -EHOSTUNREACH;
+       }
+
+       if (memcmp(src, dest, sizeof(*dest)) == 0)
+               pkt->mask |= RXE_LOOPBACK_MASK;
+
+       prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
+                       htons(ROCE_V2_UDP_DPORT));
+
+       prepare_ipv6_hdr(route, skb, src, dest, IPPROTO_UDP,
+                        av->grh.traffic_class,
+                        av->grh.hop_limit);
+       return 0;
+}
+
+/* Add the network headers that match the address vector of pkt and
+ * store the header ICRC in *crc.
+ *
+ * For a network type other than IPv4 or IPv6 no headers are added and
+ * the result is 0. The ICRC is computed even when adding the headers
+ * failed.
+ *
+ * Returns 0 or the error from prepare4()/prepare6().
+ */
+static int prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                  struct sk_buff *skb, u32 *crc)
+{
+       struct rxe_av *av = rxe_get_av(pkt);
+       int ret = 0;
+
+       switch (av->network_type) {
+       case RDMA_NETWORK_IPV4:
+               ret = prepare4(rxe, skb, av);
+               break;
+       case RDMA_NETWORK_IPV6:
+               ret = prepare6(rxe, skb, av);
+               break;
+       default:
+               break;
+       }
+
+       *crc = rxe_icrc_hdr(pkt, skb);
+
+       return ret;
+}
+
+/* skb destructor installed by send().
+ *
+ * Decrements the count of outstanding skbs of the owning qp, which is
+ * found through the socket's sk_user_data. The requester task is run
+ * when the qp asked for it (need_req_skb) and the count has fallen below
+ * RXE_INFLIGHT_SKBS_PER_QP_LOW.
+ */
+static void rxe_skb_tx_dtor(struct sk_buff *skb)
+{
+       struct rxe_qp *qp = skb->sk->sk_user_data;
+       int in_flight = atomic_dec_return(&qp->skb_out);
+
+       if (likely(!qp->need_req_skb ||
+                  in_flight >= RXE_INFLIGHT_SKBS_PER_QP_LOW))
+               return;
+
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* Transmit a packet through the local IPv4 or IPv6 output path.
+ *
+ * A clone of skb is what actually gets sent. The clone is tied to the
+ * qp's socket and given rxe_skb_tx_dtor() as destructor.
+ *
+ * Returns 0 on success, in which case the original skb is freed here.
+ * Returns -ENOMEM when cloning fails, -EINVAL for an unknown network
+ * type (the clone is freed) and -EAGAIN when the output path reports an
+ * error. The original skb is not freed on any of the error paths.
+ */
+static int send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+               struct sk_buff *skb)
+{
+       struct sk_buff *nskb;
+       struct rxe_av *av;
+       int err;
+
+       av = rxe_get_av(pkt);
+
+       nskb = skb_clone(skb, GFP_ATOMIC);
+       if (!nskb)
+               return -ENOMEM;
+
+       nskb->destructor = rxe_skb_tx_dtor;
+       nskb->sk = pkt->qp->sk->sk;
+
+       if (av->network_type == RDMA_NETWORK_IPV4) {
+               err = ip_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb);
+       } else if (av->network_type == RDMA_NETWORK_IPV6) {
+               err = ip6_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb);
+       } else {
+               pr_err("Unknown layer 3 protocol: %d\n", av->network_type);
+               /* NOTE(review): freeing the clone runs rxe_skb_tx_dtor(),
+                * which decrements qp->skb_out - confirm the caller has
+                * already accounted for this skb.
+                */
+               kfree_skb(nskb);
+               return -EINVAL;
+       }
+
+       if (unlikely(net_xmit_eval(err))) {
+               pr_debug("error sending packet: %d\n", err);
+               return -EAGAIN;
+       }
+
+       kfree_skb(skb);
+
+       return 0;
+}
+
+/* Loopback hook: pass @skb directly to rxe_rcv() and return its result. */
+static int loopback(struct sk_buff *skb)
+{
+       int ret = rxe_rcv(skb);
+
+       return ret;
+}
+
+/*
+ * Returns non-zero when the interface id of the address vector's
+ * destination GID equals this device's port GUID.
+ */
+static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av)
+{
+       return av->grh.dgid.global.interface_id == rxe->port.port_guid;
+}
+
+/*
+ * init_packet - allocate an skb for an outgoing packet
+ * @rxe: device the packet will be sent on
+ * @av: address vector; its network type selects IPv4 or IPv6 sizing
+ * @paylen: number of bytes to reserve (and zero) for the packet itself
+ * @pkt: packet info to initialise
+ *
+ * Headroom is reserved for the Ethernet, IP (v4 or v6) and UDP headers plus
+ * LL_RESERVED_SPACE() of the underlying net_device.  Any network type other
+ * than RDMA_NETWORK_IPV4 is treated as IPv6.
+ *
+ * Returns the new skb, or NULL if allocation fails.
+ */
+static struct sk_buff *init_packet(struct rxe_dev *rxe, struct rxe_av *av,
+                                  int paylen, struct rxe_pkt_info *pkt)
+{
+       const bool ipv4 = av->network_type == RDMA_NETWORK_IPV4;
+       unsigned int hdr_len;
+       struct sk_buff *skb;
+
+       hdr_len = ETH_HLEN + sizeof(struct udphdr) +
+               (ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr));
+
+       skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(rxe->ndev),
+                       GFP_ATOMIC);
+       if (unlikely(!skb))
+               return NULL;
+
+       skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev));
+
+       skb->dev = rxe->ndev;
+       skb->protocol = ipv4 ? htons(ETH_P_IP) : htons(ETH_P_IPV6);
+
+       pkt->rxe = rxe;
+       pkt->port_num = 1;
+       pkt->hdr = skb_put(skb, paylen);
+       pkt->mask |= RXE_GRH_MASK;
+
+       /* start from an all-zero packet area */
+       memset(pkt->hdr, 0, paylen);
+
+       return skb;
+}
+
+/*
+ * Returns the name of the net_device underneath @rxe; @port_num is ignored.
+ *
+ * rxe_cfg needs this to pair the rxe devices listed in
+ * /sys/class/infiniband with their underlying ethernet devices.
+ */
+static char *parent_name(struct rxe_dev *rxe, unsigned int port_num)
+{
+       struct net_device *ndev = rxe->ndev;
+
+       return ndev->name;
+}
+
+/*
+ * Link layer callback: always reports IB_LINK_LAYER_ETHERNET; neither
+ * @rxe nor @port_num is consulted.
+ */
+static enum rdma_link_layer link_layer(struct rxe_dev *rxe,
+                                      unsigned int port_num)
+{
+       return IB_LINK_LAYER_ETHERNET;
+}
+
+/* Callback table that rxe_net_add() installs as rxe->ifc_ops. */
+static struct rxe_ifc_ops ifc_ops = {
+       /* device identity and attributes */
+       .parent_name    = parent_name,
+       .link_layer     = link_layer,
+       .node_guid      = node_guid,
+       .port_guid      = port_guid,
+       .dma_device     = dma_device,
+
+       /* multicast membership */
+       .mcast_add      = mcast_add,
+       .mcast_delete   = mcast_delete,
+
+       /* packet construction and transmission */
+       .init_packet    = init_packet,
+       .prepare        = prepare,
+       .send           = send,
+       .loopback       = loopback,
+};
+
+/*
+ * rxe_net_add - create an rxe device on top of a net_device
+ * @ndev: underlying network device
+ *
+ * Allocates the rxe_dev with ib_alloc_device(), attaches ifc_ops and @ndev,
+ * passes it to rxe_add() together with the netdev MTU and, on success,
+ * appends it to rxe_dev_list under dev_list_lock.
+ *
+ * Returns the new device, or NULL if allocation or rxe_add() fails.
+ */
+struct rxe_dev *rxe_net_add(struct net_device *ndev)
+{
+       int err;
+       struct rxe_dev *rxe;
+
+       rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
+       if (!rxe)
+               return NULL;
+
+       rxe->ifc_ops = &ifc_ops;
+       rxe->ndev = ndev;
+
+       err = rxe_add(rxe, ndev->mtu);
+       if (err) {
+               ib_dealloc_device(&rxe->ib_dev);
+               return NULL;
+       }
+
+       spin_lock_bh(&dev_list_lock);
+       /* list_add_tail(new, head): the device is the new entry */
+       list_add_tail(&rxe->list, &rxe_dev_list);
+       spin_unlock_bh(&dev_list_lock);
+       return rxe;
+}
+
+/*
+ * rxe_remove_all - unlink and remove every device on rxe_dev_list
+ *
+ * Each device is taken off the list with dev_list_lock held; the lock is
+ * dropped around the call to rxe_remove() and re-taken before the list is
+ * examined again.
+ */
+void rxe_remove_all(void)
+{
+       struct rxe_dev *victim;
+
+       spin_lock_bh(&dev_list_lock);
+       for (;;) {
+               if (list_empty(&rxe_dev_list))
+                       break;
+
+               victim = list_first_entry(&rxe_dev_list, struct rxe_dev, list);
+               list_del(&victim->list);
+
+               /* rxe_remove() runs without the list lock */
+               spin_unlock_bh(&dev_list_lock);
+               rxe_remove(victim);
+               spin_lock_bh(&dev_list_lock);
+       }
+       spin_unlock_bh(&dev_list_lock);
+}
+EXPORT_SYMBOL(rxe_remove_all);
+
+/*
+ * Dispatch @event for port 1 of @rxe's ib_device through
+ * ib_dispatch_event().
+ */
+static void rxe_port_event(struct rxe_dev *rxe,
+                          enum ib_event_type event)
+{
+       struct ib_event port_ev;
+
+       port_ev.event = event;
+       port_ev.device = &rxe->ib_dev;
+       port_ev.element.port_num = 1;
+
+       ib_dispatch_event(&port_ev);
+}
+
+/*
+ * Mark the port active with the physical link up, then raise
+ * IB_EVENT_PORT_ACTIVE and log the change.
+ *
+ * Caller must hold net_info_lock
+ */
+void rxe_port_up(struct rxe_dev *rxe)
+{
+       rxe->port.attr.state = IB_PORT_ACTIVE;
+       rxe->port.attr.phys_state = IB_PHYS_STATE_LINK_UP;
+
+       rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
+       pr_info("rxe: set %s active\n", rxe->ib_dev.name);
+}
+
+/*
+ * Mark the port down with the physical link down, then raise
+ * IB_EVENT_PORT_ERR and log the change.
+ *
+ * Caller must hold net_info_lock
+ */
+void rxe_port_down(struct rxe_dev *rxe)
+{
+       rxe->port.attr.state = IB_PORT_DOWN;
+       rxe->port.attr.phys_state = IB_PHYS_STATE_LINK_DOWN;
+
+       rxe_port_event(rxe, IB_EVENT_PORT_ERR);
+       pr_info("rxe: set %s down\n", rxe->ib_dev.name);
+}
+
+/*
+ * rxe_notify - netdevice notifier callback
+ * @not_blk: notifier block (unused)
+ * @event: NETDEV_* event code
+ * @arg: notifier info from which the net_device is extracted
+ *
+ * Events for net_devices that net_to_rxe() does not map to an rxe device
+ * are ignored.  Otherwise:
+ *   NETDEV_UNREGISTER - unlink the device from rxe_dev_list and remove it
+ *   NETDEV_UP/DOWN    - update the port state and raise the IB event
+ *   NETDEV_CHANGEMTU  - propagate the new MTU with rxe_set_mtu()
+ * Every other event is only logged.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int rxe_notify(struct notifier_block *not_blk,
+                     unsigned long event,
+                     void *arg)
+{
+       struct net_device *ndev = netdev_notifier_info_to_dev(arg);
+       struct rxe_dev *rxe = net_to_rxe(ndev);
+
+       if (!rxe)
+               goto out;
+
+       switch (event) {
+       case NETDEV_UNREGISTER:
+               /*
+                * rxe_dev_list is modified under dev_list_lock everywhere
+                * else (rxe_net_add, rxe_remove_all); take it here too so
+                * the unlink cannot race with them.  The lock is dropped
+                * before rxe_remove(), as in rxe_remove_all().
+                */
+               spin_lock_bh(&dev_list_lock);
+               list_del(&rxe->list);
+               spin_unlock_bh(&dev_list_lock);
+               rxe_remove(rxe);
+               break;
+       case NETDEV_UP:
+               rxe_port_up(rxe);
+               break;
+       case NETDEV_DOWN:
+               rxe_port_down(rxe);
+               break;
+       case NETDEV_CHANGEMTU:
+               pr_info("rxe: %s changed mtu to %d\n", ndev->name, ndev->mtu);
+               rxe_set_mtu(rxe, ndev->mtu);
+               break;
+       case NETDEV_REBOOT:
+       case NETDEV_CHANGE:
+       case NETDEV_GOING_DOWN:
+       case NETDEV_CHANGEADDR:
+       case NETDEV_CHANGENAME:
+       case NETDEV_FEAT_CHANGE:
+       default:
+               pr_info("rxe: ignoring netdev event = %ld for %s\n",
+                       event, ndev->name);
+               break;
+       }
+out:
+       return NOTIFY_OK;
+}
+
+/*
+ * Notifier block whose callback is rxe_notify(); registered in
+ * rxe_net_init() and unregistered in rxe_net_exit().
+ */
+static struct notifier_block rxe_net_notifier = {
+       .notifier_call = rxe_notify,
+};
+
+/*
+ * rxe_net_init - set up the network side of the driver
+ *
+ * Initialises dev_list_lock, creates the IPv6 and then the IPv4 UDP tunnel
+ * socket on ROCE_V2_UDP_DPORT in init_net, and registers the netdevice
+ * notifier.  On failure everything created so far is released and the
+ * corresponding recv_sockets pointers are reset to NULL, so that a later
+ * rxe_net_exit() cannot release a socket twice.
+ *
+ * Returns 0 on success or a negative errno: the error encoded in the
+ * pointer returned by rxe_setup_udp_tunnel(), or the error returned by
+ * register_netdevice_notifier().
+ */
+int rxe_net_init(void)
+{
+       int err;
+
+       spin_lock_init(&dev_list_lock);
+
+       recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
+                       htons(ROCE_V2_UDP_DPORT), true);
+       if (IS_ERR(recv_sockets.sk6)) {
+               err = PTR_ERR(recv_sockets.sk6);
+               recv_sockets.sk6 = NULL;
+               pr_err("rxe: Failed to create IPv6 UDP tunnel\n");
+               return err;
+       }
+
+       recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
+                       htons(ROCE_V2_UDP_DPORT), false);
+       if (IS_ERR(recv_sockets.sk4)) {
+               err = PTR_ERR(recv_sockets.sk4);
+               recv_sockets.sk4 = NULL;
+               pr_err("rxe: Failed to create IPv4 UDP tunnel\n");
+               goto err_release_sk6;
+       }
+
+       err = register_netdevice_notifier(&rxe_net_notifier);
+       if (err) {
+               pr_err("rxe: Failed to register netdev notifier\n");
+               goto err_release_sk4;
+       }
+
+       return 0;
+
+err_release_sk4:
+       rxe_release_udp_tunnel(recv_sockets.sk4);
+       recv_sockets.sk4 = NULL;
+err_release_sk6:
+       rxe_release_udp_tunnel(recv_sockets.sk6);
+       recv_sockets.sk6 = NULL;
+       return err;
+}
+
+/*
+ * rxe_net_exit - undo rxe_net_init()
+ *
+ * Releases whichever UDP tunnel sockets are present (IPv6 first, then
+ * IPv4) and finally unregisters the netdevice notifier.
+ */
+void rxe_net_exit(void)
+{
+       struct socket *tunnel;
+
+       tunnel = recv_sockets.sk6;
+       if (tunnel != NULL)
+               rxe_release_udp_tunnel(tunnel);
+
+       tunnel = recv_sockets.sk4;
+       if (tunnel != NULL)
+               rxe_release_udp_tunnel(tunnel);
+
+       unregister_netdevice_notifier(&rxe_net_notifier);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
new file mode 100644 (file)
index 0000000..7b06f76
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_NET_H
+#define RXE_NET_H
+
+#include <net/sock.h>
+#include <net/if_inet6.h>
+#include <linux/module.h>
+
+/* The UDP tunnel sockets held in the global recv_sockets. */
+struct rxe_recv_sockets {
+       struct socket *sk4;     /* IPv4 UDP tunnel socket */
+       struct socket *sk6;     /* IPv6 UDP tunnel socket */
+};
+
+extern struct rxe_recv_sockets recv_sockets;
+
+struct rxe_dev *rxe_net_add(struct net_device *ndev);
+
+int rxe_net_init(void);
+void rxe_net_exit(void);
+
+#endif /* RXE_NET_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
new file mode 100644 (file)
index 0000000..61927c1
--- /dev/null
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_pack.h>
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+
+/* useful information about work request opcodes and pkt opcodes in
+ * table form
+ *
+ * rxe_wr_opcode_info is indexed by work request opcode (IB_WR_*).  Each
+ * entry carries the opcode's printable name and a mask array indexed by
+ * QP type (IB_QPT_*) holding WR_*_MASK flags.  QP types that are not
+ * listed for an opcode are left zero-initialised; presumably a zero mask
+ * means the opcode is not supported on that QP type (compare the
+ * IB_WR_LSO entry) - confirm against the users of this table.
+ */
+struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
+       [IB_WR_RDMA_WRITE]                              = {
+               .name   = "IB_WR_RDMA_WRITE",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+               },
+       },
+       [IB_WR_RDMA_WRITE_WITH_IMM]                     = {
+               .name   = "IB_WR_RDMA_WRITE_WITH_IMM",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+               },
+       },
+       [IB_WR_SEND]                                    = {
+               .name   = "IB_WR_SEND",
+               .mask   = {
+                       [IB_QPT_SMI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_GSI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UD]     = WR_INLINE_MASK | WR_SEND_MASK,
+               },
+       },
+       [IB_WR_SEND_WITH_IMM]                           = {
+               .name   = "IB_WR_SEND_WITH_IMM",
+               .mask   = {
+                       [IB_QPT_SMI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_GSI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UD]     = WR_INLINE_MASK | WR_SEND_MASK,
+               },
+       },
+       [IB_WR_RDMA_READ]                               = {
+               .name   = "IB_WR_RDMA_READ",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_READ_MASK,
+               },
+       },
+       [IB_WR_ATOMIC_CMP_AND_SWP]                      = {
+               .name   = "IB_WR_ATOMIC_CMP_AND_SWP",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_ATOMIC_MASK,
+               },
+       },
+       [IB_WR_ATOMIC_FETCH_AND_ADD]                    = {
+               .name   = "IB_WR_ATOMIC_FETCH_AND_ADD",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_ATOMIC_MASK,
+               },
+       },
+       [IB_WR_LSO]                                     = {
+               .name   = "IB_WR_LSO",
+               .mask   = {
+                       /* not supported */
+               },
+       },
+       [IB_WR_SEND_WITH_INV]                           = {
+               .name   = "IB_WR_SEND_WITH_INV",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UD]     = WR_INLINE_MASK | WR_SEND_MASK,
+               },
+       },
+       [IB_WR_RDMA_READ_WITH_INV]                      = {
+               .name   = "IB_WR_RDMA_READ_WITH_INV",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_READ_MASK,
+               },
+       },
+       [IB_WR_LOCAL_INV]                               = {
+               .name   = "IB_WR_LOCAL_INV",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_REG_MASK,
+               },
+       },
+       [IB_WR_REG_MR]                                  = {
+               .name   = "IB_WR_REG_MR",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_REG_MASK,
+               },
+       },
+};
+
+struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
+       [IB_OPCODE_RC_SEND_FIRST]                       = {
+               .name   = "IB_OPCODE_RC_SEND_FIRST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_MIDDLE]              = {
+               .name   = "IB_OPCODE_RC_SEND_MIDDLE]",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_LAST]                        = {
+               .name   = "IB_OPCODE_RC_SEND_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_RC_SEND_ONLY",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]         = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_FIRST",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                  = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                  = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_ONLY",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]                        = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_REQUEST",
+               .mask   = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]         = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST",
+               .mask   = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]          = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST",
+               .mask   = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]          = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY",
+               .mask   = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_ACKNOWLEDGE]                      = {
+               .name   = "IB_OPCODE_RC_ACKNOWLEDGE",
+               .mask   = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]                       = {
+               .name   = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE",
+               .mask   = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_ATMACK]    = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                       + RXE_ATMACK_BYTES + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_COMPARE_SWAP]                     = {
+               .name   = "IB_OPCODE_RC_COMPARE_SWAP",
+               .mask   = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_ATMETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_FETCH_ADD]                        = {
+               .name   = "IB_OPCODE_RC_FETCH_ADD",
+               .mask   = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_ATMETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]                = {
+               .name   = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE",
+               .mask   = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]                = {
+               .name   = "IB_OPCODE_RC_SEND_ONLY_INV",
+               .mask   = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IETH_BYTES,
+               }
+       },
+
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                       = {
+               .name   = "IB_OPCODE_UC_SEND_FIRST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_MIDDLE]              = {
+               .name   = "IB_OPCODE_UC_SEND_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_LAST]                        = {
+               .name   = "IB_OPCODE_UC_SEND_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_UC_SEND_ONLY",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]         = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_FIRST",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]                = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                  = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                  = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_ONLY",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+
+       /* RD */
+       [IB_OPCODE_RD_SEND_FIRST]                       = {
+               .name   = "IB_OPCODE_RD_SEND_FIRST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_MIDDLE]              = {
+               .name   = "IB_OPCODE_RD_SEND_MIDDLE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_SEND_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_LAST]                        = {
+               .name   = "IB_OPCODE_RD_SEND_LAST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_RD_SEND_ONLY",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_FIRST]         = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_FIRST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_LAST]                  = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_LAST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_ONLY]                  = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_ONLY",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES
+                               + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_REQUEST]                        = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_REQUEST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_REQ_MASK | RXE_READ_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST]         = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK
+                               | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE",
+               .mask   = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST]          = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_ACK_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY]          = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_ACKNOWLEDGE]                      = {
+               .name   = "IB_OPCODE_RD_ACKNOWLEDGE",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE]                       = {
+               .name   = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK
+                               | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_ATMACK]    = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_COMPARE_SWAP]                     = {
+               .name   = "RD_COMPARE_SWAP",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK
+                               | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES +
+                                               + RXE_ATMETH_BYTES
+                                               + RXE_DETH_BYTES +
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_FETCH_ADD]                        = {
+               .name   = "IB_OPCODE_RD_FETCH_ADD",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK
+                               | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES +
+                                               + RXE_ATMETH_BYTES
+                                               + RXE_DETH_BYTES +
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_UD_SEND_ONLY",
+               .mask   = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_DETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_DETH]      = RXE_BTH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+
+};
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
new file mode 100644 (file)
index 0000000..307604e
--- /dev/null
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_OPCODE_H
+#define RXE_OPCODE_H
+
+/*
+ * Contains the header type and header bit mask definitions, and the
+ * declarations of the rxe_opcode_info struct (per-opcode header length
+ * and offsets) and the rxe_wr_opcode_info struct.
+ */
+
+enum rxe_wr_mask {     /* flag bits stored in rxe_wr_opcode_info.mask[] */
+       WR_INLINE_MASK                  = BIT(0),
+       WR_ATOMIC_MASK                  = BIT(1),
+       WR_SEND_MASK                    = BIT(2),
+       WR_READ_MASK                    = BIT(3),
+       WR_WRITE_MASK                   = BIT(4),
+       WR_LOCAL_MASK                   = BIT(5),
+       WR_REG_MASK                     = BIT(6),
+       /* the values below are unions of the single-bit flags above */
+       WR_READ_OR_WRITE_MASK           = WR_READ_MASK | WR_WRITE_MASK,
+       WR_READ_WRITE_OR_SEND_MASK      = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
+       WR_WRITE_OR_SEND_MASK           = WR_WRITE_MASK | WR_SEND_MASK,
+       WR_ATOMIC_OR_READ_MASK          = WR_ATOMIC_MASK | WR_READ_MASK,
+};
+
+#define WR_MAX_QPT             (8)     /* length of rxe_wr_opcode_info.mask[] */
+
+struct rxe_wr_opcode_info {
+       char                    *name;  /* presumably a printable opcode name - confirm */
+       enum rxe_wr_mask        mask[WR_MAX_QPT];       /* NOTE(review): index looks like a QP type ("QPT") - confirm */
+};
+
+extern struct rxe_wr_opcode_info rxe_wr_opcode_info[]; /* defined elsewhere; length not visible from this header */
+
+enum rxe_hdr_type {    /* implicit values 0..11; used as bit positions in rxe_hdr_mask and as rxe_opcode_info.offset[] indices, so order matters */
+       RXE_LRH,
+       RXE_GRH,
+       RXE_BTH,
+       RXE_RETH,
+       RXE_AETH,
+       RXE_ATMETH,
+       RXE_ATMACK,
+       RXE_IETH,
+       RXE_RDETH,
+       RXE_DETH,
+       RXE_IMMDT,
+       RXE_PAYLOAD,
+       NUM_HDR_TYPES   /* count of the types above (12); sizes offset[] and is the first flag bit in rxe_hdr_mask */
+};
+
+enum rxe_hdr_mask {    /* low bits: one per enum rxe_hdr_type value */
+       RXE_LRH_MASK            = BIT(RXE_LRH),
+       RXE_GRH_MASK            = BIT(RXE_GRH),
+       RXE_BTH_MASK            = BIT(RXE_BTH),
+       RXE_IMMDT_MASK          = BIT(RXE_IMMDT),       /* listed out of enum order; value still comes from RXE_IMMDT */
+       RXE_RETH_MASK           = BIT(RXE_RETH),
+       RXE_AETH_MASK           = BIT(RXE_AETH),
+       RXE_ATMETH_MASK         = BIT(RXE_ATMETH),
+       RXE_ATMACK_MASK         = BIT(RXE_ATMACK),
+       RXE_IETH_MASK           = BIT(RXE_IETH),
+       RXE_RDETH_MASK          = BIT(RXE_RDETH),
+       RXE_DETH_MASK           = BIT(RXE_DETH),
+       RXE_PAYLOAD_MASK        = BIT(RXE_PAYLOAD),
+       /* non-header flags occupy the bits from NUM_HDR_TYPES upward */
+       RXE_REQ_MASK            = BIT(NUM_HDR_TYPES + 0),
+       RXE_ACK_MASK            = BIT(NUM_HDR_TYPES + 1),
+       RXE_SEND_MASK           = BIT(NUM_HDR_TYPES + 2),
+       RXE_WRITE_MASK          = BIT(NUM_HDR_TYPES + 3),
+       RXE_READ_MASK           = BIT(NUM_HDR_TYPES + 4),
+       RXE_ATOMIC_MASK         = BIT(NUM_HDR_TYPES + 5),
+       /* NOTE(review): presumably "consumes a receive WR" and "generates a completion" - confirm at users */
+       RXE_RWR_MASK            = BIT(NUM_HDR_TYPES + 6),
+       RXE_COMP_MASK           = BIT(NUM_HDR_TYPES + 7),
+       /* packet position in a message; the *_ONLY entries of the rxe_opcode table set both START and END */
+       RXE_START_MASK          = BIT(NUM_HDR_TYPES + 8),
+       RXE_MIDDLE_MASK         = BIT(NUM_HDR_TYPES + 9),
+       RXE_END_MASK            = BIT(NUM_HDR_TYPES + 10),
+       /* NOTE(review): bit NUM_HDR_TYPES + 11 is not assigned to any flag */
+       RXE_LOOPBACK_MASK       = BIT(NUM_HDR_TYPES + 12),
+       /* unions of the single-bit flags above */
+       RXE_READ_OR_ATOMIC      = (RXE_READ_MASK | RXE_ATOMIC_MASK),
+       RXE_WRITE_OR_SEND       = (RXE_WRITE_MASK | RXE_SEND_MASK),
+};
+
+#define OPCODE_NONE            (-1)    /* sentinel value; its users are outside this header */
+#define RXE_NUM_OPCODE         256     /* number of entries in rxe_opcode[] */
+
+struct rxe_opcode_info {       /* one table entry per opcode; the table uses IB_OPCODE_* values as designators */
+       char                    *name;  /* opcode name string, e.g. "IB_OPCODE_UC_SEND_LAST" */
+       enum rxe_hdr_mask       mask;   /* OR of rxe_hdr_mask bits */
+       int                     length; /* table entries use the sum of the header *_BYTES sizes, payload excluded */
+       int                     offset[NUM_HDR_TYPES];  /* indexed by enum rxe_hdr_type; table entries hold byte offsets with RXE_BTH at 0 */
+};
+
+extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE];      /* defined in a separate source file */
+
+#endif /* RXE_OPCODE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
new file mode 100644 (file)
index 0000000..f459c43
--- /dev/null
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_PARAM_H
+#define RXE_PARAM_H
+
+/* Map a payload size in bytes to the largest IB MTU enum value that does
+ * not exceed it.  Sizes below 256 yield 0 (no valid enum value); sizes of
+ * 4096 or more are capped at IB_MTU_4096.
+ */
+static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu)
+{
+       if (mtu >= 4096)
+               return IB_MTU_4096;
+       if (mtu >= 2048)
+               return IB_MTU_2048;
+       if (mtu >= 1024)
+               return IB_MTU_1024;
+       if (mtu >= 512)
+               return IB_MTU_512;
+       if (mtu >= 256)
+               return IB_MTU_256;
+
+       return 0;
+}
+
+/* Find the IB mtu for a given network MTU: RXE_MAX_HDR_LENGTH is
+ * subtracted from the network MTU and the remainder is mapped with
+ * rxe_mtu_int_to_enum().
+ */
+static inline enum ib_mtu eth_mtu_int_to_enum(int mtu)
+{
+       return rxe_mtu_int_to_enum(mtu - RXE_MAX_HDR_LENGTH);
+}
+
+/* default/initial rxe device parameter settings
+ *
+ * NOTE(review): RXE_MAX_MR_SIZE (-1ull) and RXE_PAGE_SIZE_CAP (0xfffff000)
+ * do not fit in an int, so this enum relies on the compiler accepting
+ * enumerators wider than int (ISO C before C23 requires int-representable
+ * values) - confirm the values survive with the intended width.
+ */
+enum rxe_device_param {
+       RXE_FW_VER                      = 0,
+       RXE_MAX_MR_SIZE                 = -1ull,
+       RXE_PAGE_SIZE_CAP               = 0xfffff000,
+       RXE_VENDOR_ID                   = 0,
+       RXE_VENDOR_PART_ID              = 0,
+       RXE_HW_VER                      = 0,
+       RXE_MAX_QP                      = 0x10000,
+       RXE_MAX_QP_WR                   = 0x4000,
+       RXE_MAX_INLINE_DATA             = 400,
+       RXE_DEVICE_CAP_FLAGS            = IB_DEVICE_BAD_PKEY_CNTR
+                                       | IB_DEVICE_BAD_QKEY_CNTR
+                                       | IB_DEVICE_AUTO_PATH_MIG
+                                       | IB_DEVICE_CHANGE_PHY_PORT
+                                       | IB_DEVICE_UD_AV_PORT_ENFORCE
+                                       | IB_DEVICE_PORT_ACTIVE_EVENT
+                                       | IB_DEVICE_SYS_IMAGE_GUID
+                                       | IB_DEVICE_RC_RNR_NAK_GEN
+                                       | IB_DEVICE_SRQ_RESIZE
+                                       | IB_DEVICE_MEM_MGT_EXTENSIONS,
+       RXE_MAX_SGE                     = 32,
+       RXE_MAX_SGE_RD                  = 32,
+       RXE_MAX_CQ                      = 16384,
+       RXE_MAX_LOG_CQE                 = 13,
+       RXE_MAX_MR                      = 2 * 1024,
+       RXE_MAX_PD                      = 0x7ffc,
+       RXE_MAX_QP_RD_ATOM              = 128,
+       RXE_MAX_EE_RD_ATOM              = 0,
+       RXE_MAX_RES_RD_ATOM             = 0x3f000,
+       RXE_MAX_QP_INIT_RD_ATOM         = 128,
+       RXE_MAX_EE_INIT_RD_ATOM         = 0,
+       RXE_ATOMIC_CAP                  = 1,
+       RXE_MAX_EE                      = 0,
+       RXE_MAX_RDD                     = 0,
+       RXE_MAX_MW                      = 0,
+       RXE_MAX_RAW_IPV6_QP             = 0,
+       RXE_MAX_RAW_ETHY_QP             = 0,
+       RXE_MAX_MCAST_GRP               = 8192,
+       RXE_MAX_MCAST_QP_ATTACH         = 56,
+       RXE_MAX_TOT_MCAST_QP_ATTACH     = 0x70000,
+       RXE_MAX_AH                      = 100,
+       RXE_MAX_FMR                     = 0,
+       RXE_MAX_MAP_PER_FMR             = 0,
+       RXE_MAX_SRQ                     = 960,
+       RXE_MAX_SRQ_WR                  = 0x4000,
+       RXE_MIN_SRQ_WR                  = 1,
+       RXE_MAX_SRQ_SGE                 = 27,
+       RXE_MIN_SRQ_SGE                 = 1,
+       RXE_MAX_FMR_PAGE_LIST_LEN       = 512,
+       RXE_MAX_PKEYS                   = 64,
+       RXE_LOCAL_CA_ACK_DELAY          = 15,
+
+       RXE_MAX_UCONTEXT                = 512,
+
+       RXE_NUM_PORT                    = 1,
+       RXE_NUM_COMP_VECTORS            = 1,
+
+       /* index ranges below are used as min_index/max_index of the
+        * indexed object pools in rxe_type_info[]
+        */
+       RXE_MIN_QP_INDEX                = 16,
+       RXE_MAX_QP_INDEX                = 0x00020000,
+
+       RXE_MIN_SRQ_INDEX               = 0x00020001,
+       RXE_MAX_SRQ_INDEX               = 0x00040000,
+
+       RXE_MIN_MR_INDEX                = 0x00000001,
+       RXE_MAX_MR_INDEX                = 0x00040000,
+       RXE_MIN_MW_INDEX                = 0x00040001,
+       RXE_MAX_MW_INDEX                = 0x00060000,
+       RXE_MAX_PKT_PER_ACK             = 64,
+
+       RXE_MAX_UNACKED_PSNS            = 128,
+
+       /* Max inflight SKBs per queue pair */
+       RXE_INFLIGHT_SKBS_PER_QP_HIGH   = 64,
+       RXE_INFLIGHT_SKBS_PER_QP_LOW    = 16,
+
+       /* Delay before calling arbiter timer */
+       RXE_NSEC_ARB_TIMER_DELAY        = 200,
+};
+
+/* default/initial rxe port parameters
+ *
+ * NOTE(review): RXE_PORT_SUBNET_PREFIX is a 64-bit constant and does not
+ * fit in an int; like RXE_MAX_MR_SIZE it depends on the compiler accepting
+ * enumerators wider than int - confirm.
+ */
+enum rxe_port_param {
+       RXE_PORT_STATE                  = IB_PORT_DOWN,
+       RXE_PORT_MAX_MTU                = IB_MTU_4096,
+       RXE_PORT_ACTIVE_MTU             = IB_MTU_256,
+       RXE_PORT_GID_TBL_LEN            = 1024,
+       RXE_PORT_PORT_CAP_FLAGS         = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP,
+       RXE_PORT_MAX_MSG_SZ             = 0x800000,
+       RXE_PORT_BAD_PKEY_CNTR          = 0,
+       RXE_PORT_QKEY_VIOL_CNTR         = 0,
+       RXE_PORT_LID                    = 0,
+       RXE_PORT_SM_LID                 = 0,
+       RXE_PORT_SM_SL                  = 0,
+       RXE_PORT_LMC                    = 0,
+       RXE_PORT_MAX_VL_NUM             = 1,
+       RXE_PORT_SUBNET_TIMEOUT         = 0,
+       RXE_PORT_INIT_TYPE_REPLY        = 0,
+       RXE_PORT_ACTIVE_WIDTH           = IB_WIDTH_1X,
+       RXE_PORT_ACTIVE_SPEED           = 1,
+       RXE_PORT_PKEY_TBL_LEN           = 64,
+       RXE_PORT_PHYS_STATE             = 2,
+       RXE_PORT_SUBNET_PREFIX          = 0xfe80000000000000ULL,
+};
+
+/* default/initial port info parameters
+ * The values are encoded codes, not raw quantities; the trailing comment
+ * on each line gives the quantity the code stands for.
+ */
+enum rxe_port_info_param {
+       RXE_PORT_INFO_VL_CAP            = 4,    /* 1-8 */
+       RXE_PORT_INFO_MTU_CAP           = 5,    /* 4096 */
+       RXE_PORT_INFO_OPER_VL           = 1,    /* 1 */
+};
+
+#endif /* RXE_PARAM_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
new file mode 100644 (file)
index 0000000..6bac071
--- /dev/null
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* info about object pools
+ * note that mr and mw share a single index space
+ * so that one can map an lkey to the correct type of object
+ *
+ * One entry per enum rxe_elem_type.  Each entry gives the slab cache name,
+ * the object size, an optional cleanup callback run when the last
+ * reference is dropped, and the pool flags:
+ *   RXE_POOL_ATOMIC - objects are allocated with GFP_ATOMIC
+ *   RXE_POOL_INDEX  - objects get an index in [min_index, max_index]
+ *   RXE_POOL_KEY    - objects are looked up by the key_size bytes found
+ *                     at key_offset inside the object
+ * The .cache member is not set here; rxe_cache_init() fills it in.
+ */
+struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
+       [RXE_TYPE_UC] = {
+               .name           = "rxe-uc",
+               .size           = sizeof(struct rxe_ucontext),
+       },
+       [RXE_TYPE_PD] = {
+               .name           = "rxe-pd",
+               .size           = sizeof(struct rxe_pd),
+       },
+       [RXE_TYPE_AH] = {
+               .name           = "rxe-ah",
+               .size           = sizeof(struct rxe_ah),
+               .flags          = RXE_POOL_ATOMIC,
+       },
+       [RXE_TYPE_SRQ] = {
+               .name           = "rxe-srq",
+               .size           = sizeof(struct rxe_srq),
+               .flags          = RXE_POOL_INDEX,
+               .min_index      = RXE_MIN_SRQ_INDEX,
+               .max_index      = RXE_MAX_SRQ_INDEX,
+       },
+       [RXE_TYPE_QP] = {
+               .name           = "rxe-qp",
+               .size           = sizeof(struct rxe_qp),
+               .cleanup        = rxe_qp_cleanup,
+               .flags          = RXE_POOL_INDEX,
+               .min_index      = RXE_MIN_QP_INDEX,
+               .max_index      = RXE_MAX_QP_INDEX,
+       },
+       [RXE_TYPE_CQ] = {
+               .name           = "rxe-cq",
+               .size           = sizeof(struct rxe_cq),
+               .cleanup        = rxe_cq_cleanup,
+       },
+       [RXE_TYPE_MR] = {
+               .name           = "rxe-mr",
+               .size           = sizeof(struct rxe_mem),
+               .cleanup        = rxe_mem_cleanup,
+               .flags          = RXE_POOL_INDEX,
+               .max_index      = RXE_MAX_MR_INDEX,
+               .min_index      = RXE_MIN_MR_INDEX,
+       },
+       /* mw objects reuse struct rxe_mem but have no cleanup callback */
+       [RXE_TYPE_MW] = {
+               .name           = "rxe-mw",
+               .size           = sizeof(struct rxe_mem),
+               .flags          = RXE_POOL_INDEX,
+               .max_index      = RXE_MAX_MW_INDEX,
+               .min_index      = RXE_MIN_MW_INDEX,
+       },
+       /* multicast groups are keyed by their mgid */
+       [RXE_TYPE_MC_GRP] = {
+               .name           = "rxe-mc_grp",
+               .size           = sizeof(struct rxe_mc_grp),
+               .cleanup        = rxe_mc_cleanup,
+               .flags          = RXE_POOL_KEY,
+               .key_offset     = offsetof(struct rxe_mc_grp, mgid),
+               .key_size       = sizeof(union ib_gid),
+       },
+       [RXE_TYPE_MC_ELEM] = {
+               .name           = "rxe-mc_elem",
+               .size           = sizeof(struct rxe_mc_elem),
+               .flags          = RXE_POOL_ATOMIC,
+       },
+};
+
+/* name of the object type this pool holds, taken from rxe_type_info[] */
+static inline char *pool_name(struct rxe_pool *pool)
+{
+       struct rxe_type_info *info = &rxe_type_info[pool->type];
+
+       return info->name;
+}
+
+/* slab cache backing this pool's object type, taken from rxe_type_info[] */
+static inline struct kmem_cache *pool_cache(struct rxe_pool *pool)
+{
+       struct rxe_type_info *info = &rxe_type_info[pool->type];
+
+       return info->cache;
+}
+
+/* element type of a pool object; arg is read as a struct rxe_pool_entry */
+static inline enum rxe_elem_type rxe_type(void *arg)
+{
+       return ((struct rxe_pool_entry *)arg)->pool->type;
+}
+
+/* Create one slab cache per entry of rxe_type_info[].
+ * Object sizes are rounded up to RXE_POOL_ALIGN.
+ *
+ * Returns 0 on success.  If any cache cannot be created, every cache
+ * created so far is destroyed again and -ENOMEM is returned.
+ */
+int rxe_cache_init(void)
+{
+       int err;
+       int i;
+       size_t size;
+       struct rxe_type_info *type;
+
+       for (i = 0; i < RXE_NUM_TYPES; i++) {
+               type = &rxe_type_info[i];
+               size = ALIGN(type->size, RXE_POOL_ALIGN);
+               type->cache = kmem_cache_create(type->name, size,
+                               RXE_POOL_ALIGN,
+                               RXE_POOL_CACHE_FLAGS, NULL);
+               if (!type->cache) {
+                       pr_err("Unable to init kmem cache for %s\n",
+                              type->name);
+                       err = -ENOMEM;
+                       goto err1;
+               }
+       }
+
+       return 0;
+
+err1:
+       /* i is the index that failed; unwind entries i-1 .. 0.  The
+        * pointer must be re-derived on every pass, otherwise only the
+        * failed (NULL) cache is "destroyed" and the earlier ones leak.
+        */
+       while (--i >= 0) {
+               type = &rxe_type_info[i];
+               kmem_cache_destroy(type->cache);
+               type->cache = NULL;
+       }
+
+       return err;
+}
+
+/* Destroy the slab cache of every entry in rxe_type_info[] and clear the
+ * stored cache pointer.
+ */
+void rxe_cache_exit(void)
+{
+       struct rxe_type_info *info;
+
+       for (info = rxe_type_info;
+            info < rxe_type_info + RXE_NUM_TYPES; info++) {
+               kmem_cache_destroy(info->cache);
+               info->cache = NULL;
+       }
+}
+
+/* Set up the index bitmap of an indexed pool.
+ * The pool gets the inclusive index range [min, max] and a zeroed bitmap
+ * with one bit per index.
+ *
+ * Returns 0 on success, -EINVAL if the range holds fewer indices than
+ * pool->max_elem, or -ENOMEM if the bitmap cannot be allocated.
+ */
+static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
+{
+       u32 nbits = max - min + 1;
+       size_t bytes;
+
+       if (nbits < pool->max_elem) {
+               pr_warn("not enough indices for max_elem\n");
+               return -EINVAL;
+       }
+
+       pool->max_index = max;
+       pool->min_index = min;
+
+       bytes = BITS_TO_LONGS(nbits) * sizeof(long);
+       pool->table = kmalloc(bytes, GFP_KERNEL);
+       if (!pool->table) {
+               pr_warn("no memory for bit table\n");
+               return -ENOMEM;
+       }
+
+       pool->table_size = bytes;
+       bitmap_zero(pool->table, nbits);
+
+       return 0;
+}
+
+/* Initialise an object pool of the given type for device rxe.
+ * The pool is zeroed, then filled in from rxe_type_info[type]; at most
+ * max_elem objects may be allocated from it.  Indexed pools also get
+ * their index bitmap, keyed pools their key offset and size.
+ *
+ * Returns 0 on success (pool is marked valid) or the error from
+ * rxe_pool_init_index(), in which case the pool is left not valid.
+ */
+int rxe_pool_init(
+       struct rxe_dev          *rxe,
+       struct rxe_pool         *pool,
+       enum rxe_elem_type      type,
+       unsigned                max_elem)
+{
+       struct rxe_type_info *info = &rxe_type_info[type];
+       int ret;
+
+       memset(pool, 0, sizeof(*pool));
+
+       pool->rxe = rxe;
+       pool->type = type;
+       pool->max_elem = max_elem;
+       pool->elem_size = ALIGN(info->size, RXE_POOL_ALIGN);
+       pool->flags = info->flags;
+       pool->tree = RB_ROOT;
+       pool->cleanup = info->cleanup;
+
+       atomic_set(&pool->num_elem, 0);
+       kref_init(&pool->ref_cnt);
+       spin_lock_init(&pool->pool_lock);
+
+       if (info->flags & RXE_POOL_INDEX) {
+               ret = rxe_pool_init_index(pool, info->max_index,
+                                         info->min_index);
+               if (ret)
+                       return ret;
+       }
+
+       if (info->flags & RXE_POOL_KEY) {
+               pool->key_offset = info->key_offset;
+               pool->key_size = info->key_size;
+       }
+
+       pool->state = rxe_pool_valid;
+
+       return 0;
+}
+
+/* kref release callback for a pool: runs when the last pool reference is
+ * dropped, marks the pool invalid and frees its index bitmap.
+ */
+static void rxe_pool_release(struct kref *kref)
+{
+       struct rxe_pool *p;
+
+       p = container_of(kref, struct rxe_pool, ref_cnt);
+       p->state = rxe_pool_invalid;
+       kfree(p->table);
+}
+
+/* drop one reference on the pool; the last put runs rxe_pool_release() */
+static void rxe_pool_put(struct rxe_pool *pool)
+{
+       struct kref *ref = &pool->ref_cnt;
+
+       kref_put(ref, rxe_pool_release);
+}
+
+/* Shut a pool down: mark it invalid under the pool lock so no further
+ * allocations or lookups succeed, warn if objects are still outstanding,
+ * then drop the reference taken by rxe_pool_init().  Always returns 0.
+ */
+int rxe_pool_cleanup(struct rxe_pool *pool)
+{
+       unsigned long irq_flags;
+       int outstanding;
+
+       spin_lock_irqsave(&pool->pool_lock, irq_flags);
+       pool->state = rxe_pool_invalid;
+       outstanding = atomic_read(&pool->num_elem);
+       if (outstanding > 0)
+               pr_warn("%s pool destroyed with unfree'd elem\n",
+                       pool_name(pool));
+       spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+
+       rxe_pool_put(pool);
+
+       return 0;
+}
+
+/* Pick a free index from the pool's bitmap and mark it used.
+ * The search starts at the bit handed out last and wraps around to the
+ * start of the bitmap once.  Returns the index, i.e. bit + min_index.
+ *
+ * NOTE(review): if every bit is set, the second search also returns
+ * nbits and the set_bit() below touches the bit just past the range.
+ * Presumably the max_elem limit keeps the bitmap from ever filling up -
+ * confirm against the callers.
+ */
+static u32 alloc_index(struct rxe_pool *pool)
+{
+       u32 nbits = pool->max_index - pool->min_index + 1;
+       u32 bit;
+
+       bit = find_next_zero_bit(pool->table, nbits, pool->last);
+       if (bit >= nbits)
+               bit = find_first_zero_bit(pool->table, nbits);
+
+       set_bit(bit, pool->table);
+       pool->last = bit;
+
+       return pool->min_index + bit;
+}
+
+/* Insert an object into the pool's rb tree, ordered by index.
+ * If an object with the same index is already present a warning is
+ * printed and the new object is not inserted.
+ */
+static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+       struct rb_node **slot = &pool->tree.rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*slot) {
+               struct rxe_pool_entry *cur;
+
+               parent = *slot;
+               cur = rb_entry(parent, struct rxe_pool_entry, node);
+
+               if (cur->index == new->index) {
+                       pr_warn("element already exists!\n");
+                       return;
+               }
+
+               slot = (cur->index > new->index) ?
+                       &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, &pool->tree);
+}
+
+/* Insert an object into the pool's rb tree, ordered by memcmp() of the
+ * key bytes stored at key_offset inside each object.
+ * If an object with an identical key is already present a warning is
+ * printed and the new object is not inserted.
+ */
+static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+       struct rb_node **slot = &pool->tree.rb_node;
+       struct rb_node *parent = NULL;
+       u8 *new_key = (u8 *)new + pool->key_offset;
+
+       while (*slot) {
+               struct rxe_pool_entry *cur;
+               int diff;
+
+               parent = *slot;
+               cur = rb_entry(parent, struct rxe_pool_entry, node);
+
+               diff = memcmp((u8 *)cur + pool->key_offset,
+                             new_key, pool->key_size);
+               if (diff == 0) {
+                       pr_warn("key already exists!\n");
+                       return;
+               }
+
+               slot = (diff > 0) ? &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, &pool->tree);
+}
+
+/* Copy key_size bytes from key into the object and insert the object
+ * into its pool's rb tree, all under the pool lock.
+ */
+void rxe_add_key(void *arg, void *key)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *p = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&p->pool_lock, irq_flags);
+       memcpy((u8 *)entry + p->key_offset, key, p->key_size);
+       insert_key(p, entry);
+       spin_unlock_irqrestore(&p->pool_lock, irq_flags);
+}
+
+/* remove a keyed object from its pool's rb tree, under the pool lock */
+void rxe_drop_key(void *arg)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *p = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&p->pool_lock, irq_flags);
+       rb_erase(&entry->node, &p->tree);
+       spin_unlock_irqrestore(&p->pool_lock, irq_flags);
+}
+
+/* Give the object a free index from its pool and insert the object into
+ * the pool's rb tree, all under the pool lock.
+ */
+void rxe_add_index(void *arg)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *p = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&p->pool_lock, irq_flags);
+       entry->index = alloc_index(p);
+       insert_index(p, entry);
+       spin_unlock_irqrestore(&p->pool_lock, irq_flags);
+}
+
+/* Release the object's index in the pool bitmap and remove the object
+ * from the pool's rb tree, all under the pool lock.
+ */
+void rxe_drop_index(void *arg)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *p = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&p->pool_lock, irq_flags);
+       clear_bit(entry->index - p->min_index, p->table);
+       rb_erase(&entry->node, &p->tree);
+       spin_unlock_irqrestore(&p->pool_lock, irq_flags);
+}
+
+/* Allocate a zeroed object from the pool.
+ * The allocation uses GFP_ATOMIC for RXE_POOL_ATOMIC pools and GFP_KERNEL
+ * (may sleep) otherwise.  On success the object holds one reference on
+ * the pool and one on the rxe device, counts towards pool->num_elem and
+ * has its own reference count initialised.
+ *
+ * Returns the new object, or NULL if the pool is not valid, the pool
+ * already holds max_elem objects, or the slab allocation fails.  On any
+ * failure all references and the element count are rolled back.
+ */
+void *rxe_alloc(struct rxe_pool *pool)
+{
+       struct rxe_pool_entry *elem;
+       unsigned long flags;
+
+       might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
+
+       /* the state check and the pool reference must be taken together
+        * so the pool cannot be released in between
+        */
+       spin_lock_irqsave(&pool->pool_lock, flags);
+       if (pool->state != rxe_pool_valid) {
+               spin_unlock_irqrestore(&pool->pool_lock, flags);
+               return NULL;
+       }
+       kref_get(&pool->ref_cnt);
+       spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+       kref_get(&pool->rxe->ref_cnt);
+
+       /* reserve a slot first; undone below if over the limit */
+       if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
+               goto out_cnt;
+
+       elem = kmem_cache_zalloc(pool_cache(pool),
+                                (pool->flags & RXE_POOL_ATOMIC) ?
+                                GFP_ATOMIC : GFP_KERNEL);
+       /* allocation can fail; do not dereference a NULL element */
+       if (!elem)
+               goto out_cnt;
+
+       elem->pool = pool;
+       kref_init(&elem->ref_cnt);
+
+       return elem;
+
+out_cnt:
+       atomic_dec(&pool->num_elem);
+       rxe_dev_put(pool->rxe);
+       rxe_pool_put(pool);
+       return NULL;
+}
+
+/* kref release callback for a pool object: runs when the last reference
+ * on the object is dropped.  Calls the pool's cleanup hook (if any),
+ * frees the object back to its slab cache, then undoes what rxe_alloc()
+ * took: the element count, the device reference and the pool reference.
+ */
+void rxe_elem_release(struct kref *kref)
+{
+       struct rxe_pool_entry *entry;
+       struct rxe_pool *p;
+
+       entry = container_of(kref, struct rxe_pool_entry, ref_cnt);
+       p = entry->pool;
+
+       if (p->cleanup)
+               p->cleanup(entry);
+
+       kmem_cache_free(pool_cache(p), entry);
+       atomic_dec(&p->num_elem);
+       rxe_dev_put(p->rxe);
+       rxe_pool_put(p);
+}
+
+/* Look up an object by index in the pool's rb tree.
+ * The search runs under the pool lock.  On a match a reference is taken
+ * on the object before it is returned.  Returns NULL if the pool is not
+ * valid or no object has that index.
+ */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
+{
+       struct rxe_pool_entry *found = NULL;
+       struct rb_node *cur;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+       if (pool->state == rxe_pool_valid) {
+               cur = pool->tree.rb_node;
+
+               while (cur) {
+                       struct rxe_pool_entry *entry =
+                               rb_entry(cur, struct rxe_pool_entry, node);
+
+                       if (entry->index > index) {
+                               cur = cur->rb_left;
+                       } else if (entry->index < index) {
+                               cur = cur->rb_right;
+                       } else {
+                               kref_get(&entry->ref_cnt);
+                               found = entry;
+                               break;
+                       }
+               }
+       }
+
+       spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+       return found;
+}
+
+/* Look up an object by key in the pool's rb tree.
+ * Keys are compared with memcmp() over key_size bytes.  The search runs
+ * under the pool lock.  On a match a reference is taken on the object
+ * before it is returned.  Returns NULL if the pool is not valid or no
+ * object has that key.
+ */
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key)
+{
+       struct rxe_pool_entry *found = NULL;
+       struct rb_node *cur;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+       if (pool->state == rxe_pool_valid) {
+               cur = pool->tree.rb_node;
+
+               while (cur) {
+                       struct rxe_pool_entry *entry =
+                               rb_entry(cur, struct rxe_pool_entry, node);
+                       int diff = memcmp((u8 *)entry + pool->key_offset,
+                                         key, pool->key_size);
+
+                       if (diff > 0) {
+                               cur = cur->rb_left;
+                       } else if (diff < 0) {
+                               cur = cur->rb_right;
+                       } else {
+                               kref_get(&entry->ref_cnt);
+                               found = entry;
+                               break;
+                       }
+               }
+       }
+
+       spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+       return found;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
new file mode 100644 (file)
index 0000000..4d04830
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_POOL_H
+#define RXE_POOL_H
+
+/* RXE_POOL_ALIGN: alignment, in bytes, used for the slab caches and for
+ * rounding up object sizes.
+ * RXE_POOL_CACHE_FLAGS: flags passed to kmem_cache_create().
+ */
+#define RXE_POOL_ALIGN         (16)
+#define RXE_POOL_CACHE_FLAGS   (0)
+
+/* per-type pool behaviour flags, see rxe_type_info[] */
+enum rxe_pool_flags {
+       RXE_POOL_ATOMIC         = BIT(0),       /* allocate with GFP_ATOMIC */
+       RXE_POOL_INDEX          = BIT(1),       /* objects are indexed */
+       RXE_POOL_KEY            = BIT(2),       /* objects are keyed */
+};
+
+/* Object types managed by pools.  The values index rxe_type_info[], so
+ * every type needs a matching entry there.
+ */
+enum rxe_elem_type {
+       RXE_TYPE_UC,
+       RXE_TYPE_PD,
+       RXE_TYPE_AH,
+       RXE_TYPE_SRQ,
+       RXE_TYPE_QP,
+       RXE_TYPE_CQ,
+       RXE_TYPE_MR,
+       RXE_TYPE_MW,
+       RXE_TYPE_MC_GRP,
+       RXE_TYPE_MC_ELEM,
+       RXE_NUM_TYPES,          /* keep me last */
+};
+
+/* Static description of one object type.
+ * @name:       slab cache name, also used in log messages
+ * @size:       object size in bytes before alignment
+ * @cleanup:    optional hook called on an object when its last reference
+ *              is dropped
+ * @flags:      enum rxe_pool_flags bits
+ * @max_index:  highest index (inclusive), indexed types only
+ * @min_index:  lowest index (inclusive), indexed types only
+ * @key_offset: byte offset of the key inside the object, keyed types only
+ * @key_size:   key length in bytes, keyed types only
+ * @cache:      slab cache, set by rxe_cache_init()
+ */
+struct rxe_type_info {
+       char                    *name;
+       size_t                  size;
+       void                    (*cleanup)(void *obj);
+       enum rxe_pool_flags     flags;
+       u32                     max_index;
+       u32                     min_index;
+       size_t                  key_offset;
+       size_t                  key_size;
+       struct kmem_cache       *cache;
+};
+
+extern struct rxe_type_info rxe_type_info[];
+
+/* Pool lifecycle state.  rxe_pool_invalid is 0, so a zeroed pool starts
+ * out invalid; allocation and lookup only succeed on a valid pool.
+ */
+enum rxe_pool_state {
+       rxe_pool_invalid,
+       rxe_pool_valid,
+};
+
+/* Bookkeeping header of every pool object.
+ * The pool code casts object pointers directly to this struct, and the
+ * rxe_add_ref()/rxe_drop_ref() macros reach it through a member named
+ * pelem, so it is presumably embedded at offset 0 of each object -
+ * NOTE(review): confirm against the object definitions.
+ */
+struct rxe_pool_entry {
+       struct rxe_pool         *pool;          /* owning pool */
+       struct kref             ref_cnt;        /* object reference count */
+       struct list_head        list;
+
+       /* only used if indexed or keyed */
+       struct rb_node          node;
+       u32                     index;
+};
+
+/* One pool of objects of a single type, set up by rxe_pool_init(). */
+struct rxe_pool {
+       struct rxe_dev          *rxe;
+       spinlock_t              pool_lock; /* pool spinlock */
+       size_t                  elem_size; /* object size after alignment */
+       struct kref             ref_cnt; /* one ref per object + one for init */
+       void                    (*cleanup)(void *obj);
+       enum rxe_pool_state     state;
+       enum rxe_pool_flags     flags;
+       enum rxe_elem_type      type;
+
+       unsigned int            max_elem; /* limit on allocated objects */
+       atomic_t                num_elem; /* objects currently allocated */
+
+       /* only used if indexed or keyed */
+       struct rb_root          tree;
+       unsigned long           *table; /* index bitmap, bit 0 = min_index */
+       size_t                  table_size; /* size of table in bytes */
+       u32                     max_index;
+       u32                     min_index;
+       u32                     last; /* bit handed out last, search hint */
+       size_t                  key_offset;
+       size_t                  key_size;
+};
+
+/* initialize slab caches for managed objects
+ * returns 0 on success or -ENOMEM if a cache cannot be created
+ */
+int rxe_cache_init(void);
+
+/* cleanup slab caches for managed objects */
+void rxe_cache_exit(void);
+
+/* initialize a pool of objects with given limit on
+ * number of elements. gets parameters from rxe_type_info
+ * pool elements will be allocated out of a slab cache
+ * returns 0 on success or a negative errno
+ */
+int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
+                 enum rxe_elem_type type, u32 max_elem);
+
+/* free resources from object pool
+ * marks the pool invalid and drops the initial pool reference;
+ * always returns 0
+ */
+int rxe_pool_cleanup(struct rxe_pool *pool);
+
+/* allocate an object from pool
+ * returns NULL if the pool is not valid or already holds max_elem objects
+ */
+void *rxe_alloc(struct rxe_pool *pool);
+
+/* assign an index to an indexed object and insert object into
+ *  pool's rb tree
+ */
+void rxe_add_index(void *elem);
+
+/* drop an index and remove object from rb tree */
+void rxe_drop_index(void *elem);
+
+/* assign a key to a keyed object and insert object into
+ *  pool's rb tree
+ */
+void rxe_add_key(void *elem, void *key);
+
+/* remove elem from rb tree */
+void rxe_drop_key(void *elem);
+
+/* lookup an indexed object from index. takes a reference on object
+ * returns NULL if nothing is found or the pool is not valid
+ */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index);
+
+/* lookup keyed object from key. takes a reference on the object
+ * returns NULL if nothing is found or the pool is not valid
+ */
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key);
+
+/* cleanup an object when all references are dropped */
+void rxe_elem_release(struct kref *kref);
+
+/* take a reference on an object
+ * elem must point to a struct with a struct rxe_pool_entry member
+ * named pelem
+ */
+#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt)
+
+/* drop a reference on an object
+ * the last drop runs rxe_elem_release() on the object
+ */
+#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release)
+
+#endif /* RXE_POOL_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
new file mode 100644 (file)
index 0000000..22ba24f
--- /dev/null
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+/* printable names for the QP_STATE_* values, indexed by state */
+char *rxe_qp_state_name[] = {
+       [QP_STATE_RESET]        = "RESET",
+       [QP_STATE_INIT]         = "INIT",
+       [QP_STATE_READY]        = "READY",
+       [QP_STATE_DRAIN]        = "DRAIN",
+       [QP_STATE_DRAINED]      = "DRAINED",
+       [QP_STATE_ERROR]        = "ERROR",
+};
+
+/* validate requested qp capabilities against the limits in rxe->attr and
+ * rxe->max_inline_data. The receive side limits are only checked when the
+ * qp does not use an srq. Returns 0 when every value is in range and
+ * -EINVAL (after logging the offending value) otherwise.
+ */
+static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
+                         int has_srq)
+{
+       if (cap->max_send_wr > rxe->attr.max_qp_wr) {
+               pr_warn("invalid send wr = %d > %d\n",
+                       cap->max_send_wr, rxe->attr.max_qp_wr);
+               return -EINVAL;
+       }
+
+       if (cap->max_send_sge > rxe->attr.max_sge) {
+               pr_warn("invalid send sge = %d > %d\n",
+                       cap->max_send_sge, rxe->attr.max_sge);
+               return -EINVAL;
+       }
+
+       if (!has_srq && cap->max_recv_wr > rxe->attr.max_qp_wr) {
+               pr_warn("invalid recv wr = %d > %d\n",
+                       cap->max_recv_wr, rxe->attr.max_qp_wr);
+               return -EINVAL;
+       }
+
+       if (!has_srq && cap->max_recv_sge > rxe->attr.max_sge) {
+               pr_warn("invalid recv sge = %d > %d\n",
+                       cap->max_recv_sge, rxe->attr.max_sge);
+               return -EINVAL;
+       }
+
+       if (cap->max_inline_data > rxe->max_inline_data) {
+               pr_warn("invalid max inline data = %d > %d\n",
+                       cap->max_inline_data, rxe->max_inline_data);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* validate the attributes passed to create qp before anything is
+ * allocated: both cqs must be present, the capabilities must be within
+ * the device limits, and an SMI/GSI qp must use port 1 and must not
+ * already exist on that port. Returns 0 or -EINVAL.
+ */
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init)
+{
+       int port_num = init->port_num;
+       int is_smi = (init->qp_type == IB_QPT_SMI);
+       int is_gsi = (init->qp_type == IB_QPT_GSI);
+
+       if (!init->recv_cq || !init->send_cq) {
+               pr_warn("missing cq\n");
+               return -EINVAL;
+       }
+
+       if (rxe_qp_chk_cap(rxe, &init->cap, !!init->srq))
+               return -EINVAL;
+
+       /* only the special qp types need the port checks below */
+       if (!is_smi && !is_gsi)
+               return 0;
+
+       if (port_num != 1) {
+               pr_warn("invalid port = %d\n", port_num);
+               return -EINVAL;
+       }
+
+       if (is_smi && rxe->port.qp_smi_index) {
+               pr_warn("SMI QP exists for port %d\n", port_num);
+               return -EINVAL;
+       }
+
+       if (is_gsi && rxe->port.qp_gsi_index) {
+               pr_warn("GSI QP exists for port %d\n", port_num);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* allocate a zeroed table of n responder resources and reset the
+ * head/tail indices. Returns 0 on success or -ENOMEM.
+ */
+static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n)
+{
+       struct resp_res *table = kcalloc(n, sizeof(*table), GFP_KERNEL);
+
+       qp->resp.res_head = 0;
+       qp->resp.res_tail = 0;
+       qp->resp.resources = table;
+
+       return table ? 0 : -ENOMEM;
+}
+
+/* release every entry of the responder resource table and then the table
+ * itself; does nothing when no table is allocated
+ */
+static void free_rd_atomic_resources(struct rxe_qp *qp)
+{
+       int idx;
+
+       if (!qp->resp.resources)
+               return;
+
+       for (idx = 0; idx < qp->attr.max_rd_atomic; idx++)
+               free_rd_atomic_resource(qp, &qp->resp.resources[idx]);
+
+       kfree(qp->resp.resources);
+       qp->resp.resources = NULL;
+}
+
+/* release whatever a single responder resource holds and mark it unused:
+ * an atomic resource drops a qp reference and frees its skb, a read
+ * resource drops its mr reference if it has one
+ */
+void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
+{
+       switch (res->type) {
+       case RXE_ATOMIC_MASK:
+               rxe_drop_ref(qp);
+               kfree_skb(res->atomic.skb);
+               break;
+
+       case RXE_READ_MASK:
+               if (res->read.mr)
+                       rxe_drop_ref(res->read.mr);
+               break;
+
+       default:
+               break;
+       }
+
+       res->type = 0;
+}
+
+/* release the contents of every responder resource but keep the table
+ * itself allocated
+ */
+static void cleanup_rd_atomic_resources(struct rxe_qp *qp)
+{
+       int idx;
+
+       if (!qp->resp.resources)
+               return;
+
+       for (idx = 0; idx < qp->attr.max_rd_atomic; idx++)
+               free_rd_atomic_resource(qp, &qp->resp.resources[idx]);
+}
+
+/* initialize the parts of a new qp that belong to neither the requester
+ * nor the responder: signalling type, initial mtu, qp number, multicast
+ * group list, send packet queue, locks and counters
+ */
+static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp,
+                            struct ib_qp_init_attr *init)
+{
+       u32 index = qp->pelem.index;
+
+       qp->sq_sig_type = init->sq_sig_type;
+       qp->attr.path_mtu = 1;
+       qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu);
+
+       /* the special qps get the fixed qp numbers 0 and 1 and their pool
+        * index is recorded in the port; every other qp uses its pool
+        * index as qp number
+        */
+       if (init->qp_type == IB_QPT_SMI) {
+               qp->ibqp.qp_num = 0;
+               rxe->port.qp_smi_index = index;
+               qp->attr.port_num = init->port_num;
+       } else if (init->qp_type == IB_QPT_GSI) {
+               qp->ibqp.qp_num = 1;
+               rxe->port.qp_gsi_index = index;
+               qp->attr.port_num = init->port_num;
+       } else {
+               qp->ibqp.qp_num = index;
+       }
+
+       INIT_LIST_HEAD(&qp->grp_list);
+
+       skb_queue_head_init(&qp->send_pkts);
+
+       spin_lock_init(&qp->grp_lock);
+       spin_lock_init(&qp->state_lock);
+
+       atomic_set(&qp->ssn, 0);
+       atomic_set(&qp->skb_out, 0);
+}
+
+/* set up the requester side of a new qp: the kernel UDP socket, the send
+ * queue (mapped to user space when udata is given), the requester and
+ * completer tasks and the rnr/retransmit timers.
+ *
+ * Returns 0 on success or a negative errno. On failure qp->sq.queue is
+ * guaranteed to be NULL so that a later rxe_qp_cleanup() cannot free the
+ * queue a second time. The socket is not released here.
+ */
+static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
+                          struct ib_qp_init_attr *init,
+                          struct ib_ucontext *context, struct ib_udata *udata)
+{
+       int err;
+       int wqe_size;
+
+       err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
+       if (err < 0)
+               return err;
+       qp->sk->sk->sk_user_data = qp;
+
+       qp->sq.max_wr           = init->cap.max_send_wr;
+       qp->sq.max_sge          = init->cap.max_send_sge;
+       qp->sq.max_inline       = init->cap.max_inline_data;
+
+       /* a wqe must be able to hold either the sge list or the inline
+        * data, whichever is larger
+        */
+       wqe_size = max_t(int, sizeof(struct rxe_send_wqe) +
+                        qp->sq.max_sge * sizeof(struct ib_sge),
+                        sizeof(struct rxe_send_wqe) +
+                        qp->sq.max_inline);
+
+       qp->sq.queue = rxe_queue_init(rxe,
+                                     &qp->sq.max_wr,
+                                     wqe_size);
+       if (!qp->sq.queue)
+               return -ENOMEM;
+
+       err = do_mmap_info(rxe, udata, true,
+                          context, qp->sq.queue->buf,
+                          qp->sq.queue->buf_size, &qp->sq.queue->ip);
+
+       if (err) {
+               kvfree(qp->sq.queue->buf);
+               kfree(qp->sq.queue);
+               /* do not leave a dangling pointer behind: rxe_qp_cleanup()
+                * frees qp->sq.queue whenever it is non-NULL
+                */
+               qp->sq.queue = NULL;
+               return err;
+       }
+
+       qp->req.wqe_index       = producer_index(qp->sq.queue);
+       qp->req.state           = QP_STATE_RESET;
+       qp->req.opcode          = -1;
+       qp->comp.opcode         = -1;
+
+       spin_lock_init(&qp->sq.sq_lock);
+       skb_queue_head_init(&qp->req_pkts);
+
+       rxe_init_task(rxe, &qp->req.task, qp,
+                     rxe_requester, "req");
+       rxe_init_task(rxe, &qp->comp.task, qp,
+                     rxe_completer, "comp");
+
+       init_timer(&qp->rnr_nak_timer);
+       qp->rnr_nak_timer.function = rnr_nak_timer;
+       qp->rnr_nak_timer.data = (unsigned long)qp;
+
+       init_timer(&qp->retrans_timer);
+       qp->retrans_timer.function = retransmit_timer;
+       qp->retrans_timer.data = (unsigned long)qp;
+       qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
+
+       return 0;
+}
+
+/* set up the responder side of a new qp: the receive queue (only when the
+ * qp has no srq; mapped to user space when udata is given), the locks,
+ * the response packet queue and the responder task.
+ *
+ * Returns 0 on success or a negative errno. On failure qp->rq.queue is
+ * guaranteed to be NULL so that a later rxe_qp_cleanup() cannot free the
+ * queue a second time.
+ */
+static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
+                           struct ib_qp_init_attr *init,
+                           struct ib_ucontext *context, struct ib_udata *udata)
+{
+       int err;
+       int wqe_size;
+
+       if (!qp->srq) {
+               qp->rq.max_wr           = init->cap.max_recv_wr;
+               qp->rq.max_sge          = init->cap.max_recv_sge;
+
+               wqe_size = rcv_wqe_size(qp->rq.max_sge);
+
+               pr_debug("max_wr = %d, max_sge = %d, wqe_size = %d\n",
+                        qp->rq.max_wr, qp->rq.max_sge, wqe_size);
+
+               qp->rq.queue = rxe_queue_init(rxe,
+                                             &qp->rq.max_wr,
+                                             wqe_size);
+               if (!qp->rq.queue)
+                       return -ENOMEM;
+
+               err = do_mmap_info(rxe, udata, false, context,
+                                  qp->rq.queue->buf,
+                                  qp->rq.queue->buf_size,
+                                  &qp->rq.queue->ip);
+               if (err) {
+                       kvfree(qp->rq.queue->buf);
+                       kfree(qp->rq.queue);
+                       /* do not leave a dangling pointer behind:
+                        * rxe_qp_cleanup() frees qp->rq.queue whenever it
+                        * is non-NULL
+                        */
+                       qp->rq.queue = NULL;
+                       return err;
+               }
+       }
+
+       spin_lock_init(&qp->rq.producer_lock);
+       spin_lock_init(&qp->rq.consumer_lock);
+
+       skb_queue_head_init(&qp->resp_pkts);
+
+       rxe_init_task(rxe, &qp->resp.task, qp,
+                     rxe_responder, "resp");
+
+       qp->resp.opcode         = OPCODE_NONE;
+       qp->resp.msn            = 0;
+       qp->resp.state          = QP_STATE_RESET;
+
+       return 0;
+}
+
+/* called by the create qp verb
+ *
+ * Takes references on the pd, both cqs and the optional srq, stores them
+ * in the qp and initializes the common, requester and responder parts.
+ * The user context is only looked up when udata is given.
+ *
+ * Returns 0 on success or a negative errno. On failure every reference
+ * taken here has been dropped again and the matching pointers in the qp
+ * (and qp->sq.queue) are cleared, so that rxe_qp_cleanup() does not drop
+ * or free them a second time.
+ */
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+                    struct ib_qp_init_attr *init, struct ib_udata *udata,
+                    struct ib_pd *ibpd)
+{
+       int err;
+       struct rxe_cq *rcq = to_rcq(init->recv_cq);
+       struct rxe_cq *scq = to_rcq(init->send_cq);
+       struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
+       struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
+
+       rxe_add_ref(pd);
+       rxe_add_ref(rcq);
+       rxe_add_ref(scq);
+       if (srq)
+               rxe_add_ref(srq);
+
+       qp->pd                  = pd;
+       qp->rcq                 = rcq;
+       qp->scq                 = scq;
+       qp->srq                 = srq;
+
+       rxe_qp_init_misc(rxe, qp, init);
+
+       err = rxe_qp_init_req(rxe, qp, init, context, udata);
+       if (err)
+               goto err1;
+
+       err = rxe_qp_init_resp(rxe, qp, init, context, udata);
+       if (err)
+               goto err2;
+
+       qp->attr.qp_state = IB_QPS_RESET;
+       qp->valid = 1;
+
+       return 0;
+
+err2:
+       rxe_queue_cleanup(qp->sq.queue);
+       qp->sq.queue = NULL;
+err1:
+       /* the references are dropped right below; clear the pointers so
+        * that rxe_qp_cleanup() cannot drop them again
+        */
+       qp->pd = NULL;
+       qp->rcq = NULL;
+       qp->scq = NULL;
+       qp->srq = NULL;
+
+       if (srq)
+               rxe_drop_ref(srq);
+       rxe_drop_ref(scq);
+       rxe_drop_ref(rcq);
+       rxe_drop_ref(pd);
+
+       return err;
+}
+
+/* called by the query qp verb
+ *
+ * Fills init with the creation-time attributes of qp. The receive
+ * capabilities are left untouched when the qp uses an srq. Always
+ * returns 0.
+ */
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init)
+{
+       struct ib_qp *ibqp = &qp->ibqp;
+
+       /* values kept in the generic ib_qp */
+       init->event_handler = ibqp->event_handler;
+       init->qp_context = ibqp->qp_context;
+       init->send_cq = ibqp->send_cq;
+       init->recv_cq = ibqp->recv_cq;
+       init->srq = ibqp->srq;
+       init->qp_type = ibqp->qp_type;
+
+       /* values kept in the rxe qp */
+       init->sq_sig_type = qp->sq_sig_type;
+       init->port_num = 1;
+
+       init->cap.max_send_wr = qp->sq.max_wr;
+       init->cap.max_send_sge = qp->sq.max_sge;
+       init->cap.max_inline_data = qp->sq.max_inline;
+
+       if (!qp->srq) {
+               init->cap.max_recv_wr = qp->rq.max_wr;
+               init->cap.max_recv_sge = qp->rq.max_sge;
+       }
+
+       return 0;
+}
+
+/* called by the modify qp verb before anything is changed
+ *
+ * Checks the requested state transition and every attribute selected by
+ * mask against the device and port limits. Returns 0 when the request is
+ * acceptable and -EINVAL otherwise.
+ */
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+                   struct ib_qp_attr *attr, int mask)
+{
+       enum ib_qp_state cur_state;
+       enum ib_qp_state new_state;
+
+       cur_state = (mask & IB_QP_CUR_STATE) ?
+                       attr->cur_qp_state : qp->attr.qp_state;
+       new_state = (mask & IB_QP_STATE) ? attr->qp_state : cur_state;
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask,
+                               IB_LINK_LAYER_ETHERNET)) {
+               pr_warn("invalid mask or state for qp\n");
+               return -EINVAL;
+       }
+
+       /* while the send queue is still draining only a move to the error
+        * state is allowed
+        */
+       if ((mask & IB_QP_STATE) && cur_state == IB_QPS_SQD &&
+           qp->req.state == QP_STATE_DRAIN && new_state != IB_QPS_ERR)
+               return -EINVAL;
+
+       if ((mask & IB_QP_PORT) && attr->port_num != 1) {
+               pr_warn("invalid port %d\n", attr->port_num);
+               return -EINVAL;
+       }
+
+       if ((mask & IB_QP_CAP) && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq))
+               return -EINVAL;
+
+       if ((mask & IB_QP_AV) && rxe_av_chk_attr(rxe, &attr->ah_attr))
+               return -EINVAL;
+
+       if (mask & IB_QP_ALT_PATH) {
+               if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr))
+                       return -EINVAL;
+
+               if (attr->alt_port_num != 1) {
+                       pr_warn("invalid alt port %d\n", attr->alt_port_num);
+                       return -EINVAL;
+               }
+
+               if (attr->alt_timeout > 31) {
+                       pr_warn("invalid QP alt timeout %d > 31\n",
+                               attr->alt_timeout);
+                       return -EINVAL;
+               }
+       }
+
+       if (mask & IB_QP_PATH_MTU) {
+               enum ib_mtu limit = rxe->port.attr.max_mtu;
+               enum ib_mtu wanted = attr->path_mtu;
+
+               if (wanted > limit) {
+                       pr_debug("invalid mtu (%d) > (%d)\n",
+                                ib_mtu_enum_to_int(wanted),
+                                ib_mtu_enum_to_int(limit));
+                       return -EINVAL;
+               }
+       }
+
+       if ((mask & IB_QP_MAX_QP_RD_ATOMIC) &&
+           attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) {
+               pr_warn("invalid max_rd_atomic %d > %d\n",
+                       attr->max_rd_atomic,
+                       rxe->attr.max_qp_rd_atom);
+               return -EINVAL;
+       }
+
+       if ((mask & IB_QP_TIMEOUT) && attr->timeout > 31) {
+               pr_warn("invalid QP timeout %d > 31\n",
+                       attr->timeout);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* move the qp to the reset state
+ *
+ * The tasks are disabled, both state machines are set to QP_STATE_RESET
+ * and each task function is then run once by hand through
+ * __rxe_do_task(). Afterwards the per-qp bookkeeping is cleared, the
+ * responder's mr reference and rd/atomic resources are released and the
+ * tasks are enabled again. The completer task is only disabled/enabled
+ * for RC qps, and the requester/completer are only touched when a send
+ * queue exists.
+ */
+static void rxe_qp_reset(struct rxe_qp *qp)
+{
+       /* stop tasks from running */
+       rxe_disable_task(&qp->resp.task);
+
+       /* stop request/comp */
+       if (qp->sq.queue) {
+               if (qp_type(qp) == IB_QPT_RC)
+                       rxe_disable_task(&qp->comp.task);
+               rxe_disable_task(&qp->req.task);
+       }
+
+       /* move qp to the reset state */
+       qp->req.state = QP_STATE_RESET;
+       qp->resp.state = QP_STATE_RESET;
+
+       /* let state machines reset themselves drain work and packet queues
+        * etc.
+        */
+       __rxe_do_task(&qp->resp.task);
+
+       if (qp->sq.queue) {
+               __rxe_do_task(&qp->comp.task);
+               __rxe_do_task(&qp->req.task);
+       }
+
+       /* cleanup attributes */
+       atomic_set(&qp->ssn, 0);
+       qp->req.opcode = -1;
+       qp->req.need_retry = 0;
+       qp->req.noack_pkts = 0;
+       qp->resp.msn = 0;
+       qp->resp.opcode = -1;
+       qp->resp.drop_msg = 0;
+       qp->resp.goto_error = 0;
+       qp->resp.sent_psn_nak = 0;
+
+       /* drop the mr the responder may still be holding */
+       if (qp->resp.mr) {
+               rxe_drop_ref(qp->resp.mr);
+               qp->resp.mr = NULL;
+       }
+
+       /* release the contents of the resources, the table itself is kept */
+       cleanup_rd_atomic_resources(qp);
+
+       /* reenable tasks */
+       rxe_enable_task(&qp->resp.task);
+
+       if (qp->sq.queue) {
+               if (qp_type(qp) == IB_QPT_RC)
+                       rxe_enable_task(&qp->comp.task);
+
+               rxe_enable_task(&qp->req.task);
+       }
+}
+
+/* drain the send queue
+ *
+ * Nothing to do without a send queue or when the requester has already
+ * reached QP_STATE_DRAINED. Otherwise the requester is put into
+ * QP_STATE_DRAIN and the completer and requester are kicked; for non-RC
+ * qps the completer is run directly instead of being scheduled.
+ */
+static void rxe_qp_drain(struct rxe_qp *qp)
+{
+       if (!qp->sq.queue)
+               return;
+
+       if (qp->req.state == QP_STATE_DRAINED)
+               return;
+
+       qp->req.state = QP_STATE_DRAIN;
+
+       if (qp_type(qp) != IB_QPT_RC)
+               __rxe_do_task(&qp->comp.task);
+       else
+               rxe_run_task(&qp->comp.task, 1);
+
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* move the qp to the error state
+ *
+ * Marks both the requester and the responder as QP_STATE_ERROR and then
+ * kicks the responder, completer and requester tasks. For non-RC qps the
+ * completer is run directly through __rxe_do_task() instead of being
+ * scheduled with rxe_run_task().
+ */
+void rxe_qp_error(struct rxe_qp *qp)
+{
+       qp->req.state = QP_STATE_ERROR;
+       qp->resp.state = QP_STATE_ERROR;
+
+       /* drain work and packet queues */
+       rxe_run_task(&qp->resp.task, 1);
+
+       if (qp_type(qp) == IB_QPT_RC)
+               rxe_run_task(&qp->comp.task, 1);
+       else
+               __rxe_do_task(&qp->comp.task);
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* called by the modify qp verb
+ *
+ * Applies every attribute selected by mask to the qp. The values are
+ * expected to have been validated by rxe_qp_chk_attr() beforehand. The
+ * state change, if any, is applied last.
+ *
+ * Returns 0 on success, or the error from allocating the rd/atomic
+ * resource table (in which case nothing else has been changed).
+ */
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
+                    struct ib_udata *udata)
+{
+       int err;
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       union ib_gid sgid;
+       struct ib_gid_attr sgid_attr;
+
+       if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
+               /* __roundup_pow_of_two() is undefined for 0, so keep a
+                * requested depth of 0 as it is
+                */
+               int max_rd_atomic = attr->max_rd_atomic ?
+                       __roundup_pow_of_two(attr->max_rd_atomic) : 0;
+
+               free_rd_atomic_resources(qp);
+
+               err = alloc_rd_atomic_resources(qp, max_rd_atomic);
+               if (err)
+                       return err;
+
+               qp->attr.max_rd_atomic = max_rd_atomic;
+               atomic_set(&qp->req.rd_atomic, max_rd_atomic);
+       }
+
+       /* NOTE(review): this stores attr->qp_state rather than
+        * attr->cur_qp_state - confirm that this is intended
+        */
+       if (mask & IB_QP_CUR_STATE)
+               qp->attr.cur_qp_state = attr->qp_state;
+
+       if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
+               qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify;
+
+       if (mask & IB_QP_ACCESS_FLAGS)
+               qp->attr.qp_access_flags = attr->qp_access_flags;
+
+       if (mask & IB_QP_PKEY_INDEX)
+               qp->attr.pkey_index = attr->pkey_index;
+
+       if (mask & IB_QP_PORT)
+               qp->attr.port_num = attr->port_num;
+
+       if (mask & IB_QP_QKEY)
+               qp->attr.qkey = attr->qkey;
+
+       if (mask & IB_QP_AV) {
+               /* NOTE(review): the return value of ib_get_cached_gid() is
+                * not checked here
+                */
+               ib_get_cached_gid(&rxe->ib_dev, 1,
+                                 attr->ah_attr.grh.sgid_index, &sgid,
+                                 &sgid_attr);
+               rxe_av_from_attr(rxe, attr->port_num, &qp->pri_av,
+                                &attr->ah_attr);
+               rxe_av_fill_ip_info(rxe, &qp->pri_av, &attr->ah_attr,
+                                   &sgid_attr, &sgid);
+               /* release the net device reference held in sgid_attr */
+               if (sgid_attr.ndev)
+                       dev_put(sgid_attr.ndev);
+       }
+
+       if (mask & IB_QP_ALT_PATH) {
+               ib_get_cached_gid(&rxe->ib_dev, 1,
+                                 attr->alt_ah_attr.grh.sgid_index, &sgid,
+                                 &sgid_attr);
+
+               rxe_av_from_attr(rxe, attr->alt_port_num, &qp->alt_av,
+                                &attr->alt_ah_attr);
+               rxe_av_fill_ip_info(rxe, &qp->alt_av, &attr->alt_ah_attr,
+                                   &sgid_attr, &sgid);
+               if (sgid_attr.ndev)
+                       dev_put(sgid_attr.ndev);
+
+               qp->attr.alt_port_num = attr->alt_port_num;
+               qp->attr.alt_pkey_index = attr->alt_pkey_index;
+               qp->attr.alt_timeout = attr->alt_timeout;
+       }
+
+       if (mask & IB_QP_PATH_MTU) {
+               qp->attr.path_mtu = attr->path_mtu;
+               qp->mtu = ib_mtu_enum_to_int(attr->path_mtu);
+       }
+
+       if (mask & IB_QP_TIMEOUT) {
+               qp->attr.timeout = attr->timeout;
+               if (attr->timeout == 0) {
+                       qp->qp_timeout_jiffies = 0;
+               } else {
+                       /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */
+                       int j = nsecs_to_jiffies(4096ULL << attr->timeout);
+
+                       /* never round a non-zero timeout down to 0 jiffies */
+                       qp->qp_timeout_jiffies = j ? j : 1;
+               }
+       }
+
+       if (mask & IB_QP_RETRY_CNT) {
+               qp->attr.retry_cnt = attr->retry_cnt;
+               qp->comp.retry_cnt = attr->retry_cnt;
+               pr_debug("set retry count = %d\n", attr->retry_cnt);
+       }
+
+       if (mask & IB_QP_RNR_RETRY) {
+               qp->attr.rnr_retry = attr->rnr_retry;
+               qp->comp.rnr_retry = attr->rnr_retry;
+               pr_debug("set rnr retry count = %d\n", attr->rnr_retry);
+       }
+
+       if (mask & IB_QP_RQ_PSN) {
+               qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK);
+               qp->resp.psn = qp->attr.rq_psn;
+               pr_debug("set resp psn = 0x%x\n", qp->resp.psn);
+       }
+
+       if (mask & IB_QP_MIN_RNR_TIMER) {
+               qp->attr.min_rnr_timer = attr->min_rnr_timer;
+               pr_debug("set min rnr timer = 0x%x\n",
+                        attr->min_rnr_timer);
+       }
+
+       if (mask & IB_QP_SQ_PSN) {
+               qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK);
+               qp->req.psn = qp->attr.sq_psn;
+               qp->comp.psn = qp->attr.sq_psn;
+               pr_debug("set req psn = 0x%x\n", qp->req.psn);
+       }
+
+       if (mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+               /* same guard as above: 0 must not be rounded up */
+               qp->attr.max_dest_rd_atomic = attr->max_dest_rd_atomic ?
+                       __roundup_pow_of_two(attr->max_dest_rd_atomic) : 0;
+       }
+
+       if (mask & IB_QP_PATH_MIG_STATE)
+               qp->attr.path_mig_state = attr->path_mig_state;
+
+       if (mask & IB_QP_DEST_QPN)
+               qp->attr.dest_qp_num = attr->dest_qp_num;
+
+       if (mask & IB_QP_STATE) {
+               qp->attr.qp_state = attr->qp_state;
+
+               switch (attr->qp_state) {
+               case IB_QPS_RESET:
+                       pr_debug("qp state -> RESET\n");
+                       rxe_qp_reset(qp);
+                       break;
+
+               case IB_QPS_INIT:
+                       pr_debug("qp state -> INIT\n");
+                       qp->req.state = QP_STATE_INIT;
+                       qp->resp.state = QP_STATE_INIT;
+                       break;
+
+               case IB_QPS_RTR:
+                       /* only the responder becomes ready here */
+                       pr_debug("qp state -> RTR\n");
+                       qp->resp.state = QP_STATE_READY;
+                       break;
+
+               case IB_QPS_RTS:
+                       /* only the requester becomes ready here */
+                       pr_debug("qp state -> RTS\n");
+                       qp->req.state = QP_STATE_READY;
+                       break;
+
+               case IB_QPS_SQD:
+                       pr_debug("qp state -> SQD\n");
+                       rxe_qp_drain(qp);
+                       break;
+
+               case IB_QPS_SQE:
+                       pr_warn("qp state -> SQE !!?\n");
+                       /* Not possible from modify_qp. */
+                       break;
+
+               case IB_QPS_ERR:
+                       pr_debug("qp state -> ERR\n");
+                       rxe_qp_error(qp);
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/* called by the query qp verb
+ *
+ * Copies the stored attributes into attr and then overrides the fields
+ * that are tracked elsewhere in the qp: the current psns, the queue
+ * capabilities, both address vectors and the sq_draining flag. Always
+ * returns 0.
+ */
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct ib_qp_cap *cap = &attr->cap;
+
+       *attr = qp->attr;
+
+       attr->rq_psn = qp->resp.psn;
+       attr->sq_psn = qp->req.psn;
+
+       cap->max_send_wr = qp->sq.max_wr;
+       cap->max_send_sge = qp->sq.max_sge;
+       cap->max_inline_data = qp->sq.max_inline;
+
+       if (!qp->srq) {
+               cap->max_recv_wr = qp->rq.max_wr;
+               cap->max_recv_sge = qp->rq.max_sge;
+       }
+
+       rxe_av_to_attr(rxe, &qp->pri_av, &attr->ah_attr);
+       rxe_av_to_attr(rxe, &qp->alt_av, &attr->alt_ah_attr);
+
+       attr->sq_draining = (qp->req.state == QP_STATE_DRAIN) ? 1 : 0;
+
+       /* applications that see the draining state typically poll for
+        * it to change, so give up the processor
+        */
+       if (attr->sq_draining)
+               cond_resched();
+
+       pr_debug("attr->sq_draining = %d\n", attr->sq_draining);
+
+       return 0;
+}
+
+/* called by the destroy qp verb
+ *
+ * Marks the qp invalid, clears the retransmit timeout, tears down the
+ * tasks and timers and finally runs the requester (and, when a send
+ * queue exists, the completer) by hand once more. The completer task is
+ * only cleaned up for RC qps.
+ */
+void rxe_qp_destroy(struct rxe_qp *qp)
+{
+       qp->valid = 0;
+       qp->qp_timeout_jiffies = 0;
+       rxe_cleanup_task(&qp->resp.task);
+
+       /* wait for timer handlers that may still be running */
+       del_timer_sync(&qp->retrans_timer);
+       del_timer_sync(&qp->rnr_nak_timer);
+
+       rxe_cleanup_task(&qp->req.task);
+       if (qp_type(qp) == IB_QPT_RC)
+               rxe_cleanup_task(&qp->comp.task);
+
+       /* flush out any receive wr's or pending requests */
+       __rxe_do_task(&qp->req.task);
+       if (qp->sq.queue) {
+               __rxe_do_task(&qp->comp.task);
+               __rxe_do_task(&qp->req.task);
+       }
+}
+
+/* called when the last reference to the qp is dropped
+ *
+ * Detaches the qp from its multicast groups, frees both work queues,
+ * drops the references held on the srq, cqs, pd and responder mr, frees
+ * the rd/atomic resources and finally shuts down and releases the socket
+ * created by rxe_qp_init_req(). Every step is skipped when the matching
+ * pointer is NULL, so a partially initialized qp can be cleaned up too.
+ */
+void rxe_qp_cleanup(void *arg)
+{
+       struct rxe_qp *qp = arg;
+
+       rxe_drop_all_mcast_groups(qp);
+
+       if (qp->sq.queue)
+               rxe_queue_cleanup(qp->sq.queue);
+
+       if (qp->srq)
+               rxe_drop_ref(qp->srq);
+
+       if (qp->rq.queue)
+               rxe_queue_cleanup(qp->rq.queue);
+
+       if (qp->scq)
+               rxe_drop_ref(qp->scq);
+       if (qp->rcq)
+               rxe_drop_ref(qp->rcq);
+       if (qp->pd)
+               rxe_drop_ref(qp->pd);
+
+       if (qp->resp.mr) {
+               rxe_drop_ref(qp->resp.mr);
+               qp->resp.mr = NULL;
+       }
+
+       free_rd_atomic_resources(qp);
+
+       /* shutting the socket down does not free it; it has to be released
+        * as well or every destroyed qp leaks one socket.
+        * NOTE(review): the NULL check assumes the qp is zeroed when it is
+        * allocated, so that qp->sk is NULL if sock_create_kern() failed -
+        * confirm against the pool allocator.
+        */
+       if (qp->sk) {
+               kernel_sock_shutdown(qp->sk, SHUT_RDWR);
+               sock_release(qp->sk);
+               qp->sk = NULL;
+       }
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c
new file mode 100644 (file)
index 0000000..0827425
--- /dev/null
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/vmalloc.h>
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/* publish the mmap information of a queue buffer to user space
+ *
+ * When udata is NULL nothing is created and *ip_p is set to NULL.
+ * Otherwise an rxe_mmap_info is created for buf, its info member is
+ * copied into the user's response buffer and the object is added to
+ * rxe->pending_mmaps. With is_req set the info is written behind one
+ * leading struct mminfo in the response buffer, otherwise at offset 0.
+ *
+ * Returns 0 on success, -EFAULT when the copy to user space fails and
+ * -EINVAL when the response buffer is too small or the mmap info could
+ * not be created. *ip_p is only written on success.
+ */
+int do_mmap_info(struct rxe_dev *rxe,
+                struct ib_udata *udata,
+                bool is_req,
+                struct ib_ucontext *context,
+                struct rxe_queue_buf *buf,
+                size_t buf_size,
+                struct rxe_mmap_info **ip_p)
+{
+       size_t offset;
+       struct rxe_mmap_info *ip = NULL;
+
+       if (udata) {
+               offset = is_req ? sizeof(struct mminfo) : 0;
+
+               /* compare sums instead of subtracting from outlen: the
+                * subtraction is unsigned and would wrap around for a
+                * buffer shorter than offset, defeating the check
+                */
+               if (udata->outlen < offset + sizeof(ip->info))
+                       return -EINVAL;
+
+               ip = rxe_create_mmap_info(rxe, buf_size, context, buf);
+               if (!ip)
+                       return -EINVAL;
+
+               if (copy_to_user(udata->outbuf + offset, &ip->info,
+                                sizeof(ip->info))) {
+                       kfree(ip);
+                       return -EFAULT;
+               }
+
+               spin_lock_bh(&rxe->pending_lock);
+               list_add(&ip->pending_mmaps, &rxe->pending_mmaps);
+               spin_unlock_bh(&rxe->pending_lock);
+       }
+
+       *ip_p = ip;
+
+       return 0;
+}
+
+/* allocate a queue with room for at least *num_elem elements of
+ * elem_size bytes
+ *
+ * The element size is padded to at least a cache line and rounded up to
+ * a power of two; the number of slots is *num_elem + 1 rounded up to a
+ * power of two. On success *num_elem is updated to the real capacity
+ * (slots - 1) and the new queue is returned with q->ip set to NULL.
+ * Returns NULL when *num_elem is negative, the buffer size does not fit
+ * in a size_t or an allocation fails.
+ */
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
+                                int *num_elem,
+                                unsigned int elem_size)
+{
+       struct rxe_queue *q;
+       size_t slot_bytes;
+       size_t buf_size;
+       unsigned int num_slots;
+
+       /* num_elem == 0 is allowed, but uninteresting */
+       if (*num_elem < 0)
+               goto err1;
+
+       /* zeroed so that q->ip is NULL until do_mmap_info() fills it in;
+        * rxe_queue_cleanup() decides how to free the buffer based on it
+        */
+       q = kzalloc(sizeof(*q), GFP_KERNEL);
+       if (!q)
+               goto err1;
+
+       q->rxe = rxe;
+
+       /* used in resize, only need to copy used part of queue */
+       q->elem_size = elem_size;
+
+       /* pad element up to at least a cacheline and always a power of 2 */
+       if (elem_size < cache_line_size())
+               elem_size = cache_line_size();
+       elem_size = roundup_pow_of_two(elem_size);
+
+       q->log2_elem_size = order_base_2(elem_size);
+
+       /* add in unsigned arithmetic: *num_elem may be INT_MAX */
+       num_slots = (unsigned int)*num_elem + 1;
+       num_slots = roundup_pow_of_two(num_slots);
+       q->index_mask = num_slots - 1;
+
+       /* compute the size in size_t and refuse anything that wraps, a
+        * too small buffer would be overrun by the queue users
+        */
+       slot_bytes = (size_t)num_slots * elem_size;
+       if (elem_size && slot_bytes / elem_size != num_slots)
+               goto err2;
+
+       buf_size = sizeof(struct rxe_queue_buf) + slot_bytes;
+       if (buf_size < slot_bytes)
+               goto err2;
+
+       q->buf = vmalloc_user(buf_size);
+       if (!q->buf)
+               goto err2;
+
+       q->buf->log2_elem_size = q->log2_elem_size;
+       q->buf->index_mask = q->index_mask;
+
+       q->buf_size = buf_size;
+
+       *num_elem = num_slots - 1;
+       return q;
+
+err2:
+       kfree(q);
+err1:
+       return NULL;
+}
+
+/* moves every queued element from q into new_q and then exchanges the
+ * contents of the two queue headers, so that anyone still holding a
+ * pointer to q keeps working and new_q ends up describing the old buffer.
+ * Returns -EINVAL without touching either queue when the elements queued
+ * in q do not fit into num_elem, 0 otherwise.
+ */
+static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q,
+                        unsigned int num_elem)
+{
+       if (!queue_empty(q) && (num_elem < queue_count(q)))
+               return -EINVAL;
+
+       for (; !queue_empty(q); advance_consumer(q)) {
+               memcpy(producer_addr(new_q), consumer_addr(q),
+                      new_q->elem_size);
+               advance_producer(new_q);
+       }
+
+       swap(*q, *new_q);
+
+       return 0;
+}
+
+/* resize q to hold *num_elem_p elements of elem_size bytes
+ *
+ * A new queue is allocated and published to user space, then the
+ * elements are moved over with the consumer lock (and the producer lock,
+ * if one is given) held. resize_finish() swaps the queue headers on
+ * success, so the rxe_queue_cleanup() call afterwards frees the old
+ * buffer on success and the unused new one on failure. On success
+ * *num_elem_p is updated to the real capacity.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int rxe_queue_resize(struct rxe_queue *q,
+                    unsigned int *num_elem_p,
+                    unsigned int elem_size,
+                    struct ib_ucontext *context,
+                    struct ib_udata *udata,
+                    spinlock_t *producer_lock,
+                    spinlock_t *consumer_lock)
+{
+       struct rxe_queue *new_q;
+       unsigned int num_elem = *num_elem_p;
+       int err;
+       unsigned long prod_flags = 0, cons_flags;
+
+       new_q = rxe_queue_init(q->rxe, &num_elem, elem_size);
+       if (!new_q)
+               return -ENOMEM;
+
+       err = do_mmap_info(new_q->rxe, udata, false, context, new_q->buf,
+                          new_q->buf_size, &new_q->ip);
+       if (err) {
+               vfree(new_q->buf);
+               kfree(new_q);
+               return err;
+       }
+
+       spin_lock_irqsave(consumer_lock, cons_flags);
+       if (producer_lock)
+               spin_lock_irqsave(producer_lock, prod_flags);
+
+       err = resize_finish(q, new_q, num_elem);
+
+       if (producer_lock)
+               spin_unlock_irqrestore(producer_lock, prod_flags);
+       spin_unlock_irqrestore(consumer_lock, cons_flags);
+
+       rxe_queue_cleanup(new_q);       /* new/old dep on err */
+       if (err)
+               return err;
+
+       *num_elem_p = num_elem;
+       return 0;
+}
+
+/* free a queue: a buffer that has mmap info attached is released by
+ * dropping the reference on that info (rxe_mmap_release() is the release
+ * callback), any other buffer is freed directly; the queue header is
+ * freed in both cases
+ */
+void rxe_queue_cleanup(struct rxe_queue *q)
+{
+       struct rxe_mmap_info *info = q->ip;
+
+       if (!info)
+               vfree(q->buf);
+       else
+               kref_put(&info->ref, rxe_mmap_release);
+
+       kfree(q);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h
new file mode 100644 (file)
index 0000000..239fd60
--- /dev/null
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_QUEUE_H
+#define RXE_QUEUE_H
+
+/* Implements a simple circular buffer that can optionally be
+ * shared between user space and the kernel and can be resized.
+ *
+ * The requested element size is rounded up to a power of 2
+ * and the number of elements in the buffer is also rounded
+ * up to a power of 2. Since the queue is empty when the
+ * producer and consumer indices match, the maximum capacity
+ * of the queue is one less than the number of element slots.
+ */
+
+/* This data structure is shared between user space and kernel
+ * space for those cases where the queue is shared. It contains
+ * the producer and consumer indices. It also contains a copy
+ * of the queue size parameters for user space to use, but the
+ * kernel must use the parameters in the rxe_queue struct.
+ * This MUST MATCH the corresponding librxe struct.
+ * For performance reasons, arrange to have the producer and
+ * consumer indices in separate cache lines.
+ * The kernel should always mask the indices to avoid accessing
+ * memory outside of the data area.
+ */
+/* Header placed in front of the element storage; the field order and
+ * padding are part of the layout and must not be changed casually.
+ */
+struct rxe_queue_buf {
+       /* size parameters; the kernel keeps its own copy in struct rxe_queue */
+       __u32                   log2_elem_size;
+       __u32                   index_mask;
+       /* padding so that the next field starts 32 __u32 words in */
+       __u32                   pad_1[30];
+       __u32                   producer_index;
+       /* padding so that consumer_index sits 32 __u32 words after producer_index */
+       __u32                   pad_2[31];
+       __u32                   consumer_index;
+       __u32                   pad_3[31];
+       /* element slots start here, each (1 << log2_elem_size) bytes apart */
+       __u8                    data[0];
+};
+
+/* Kernel-side descriptor of a queue. The size fields here are the ones the
+ * inline helpers in this header use for masking and address computation.
+ */
+struct rxe_queue {
+       struct rxe_dev          *rxe;
+       /* header + element storage (see struct rxe_queue_buf) */
+       struct rxe_queue_buf    *buf;
+       /* mmap bookkeeping filled in by do_mmap_info(); may be NULL */
+       struct rxe_mmap_info    *ip;
+       size_t                  buf_size;
+       size_t                  elem_size;
+       /* shift used to turn a slot index into a byte offset in buf->data */
+       unsigned int            log2_elem_size;
+       /* mask applied to producer/consumer indices */
+       unsigned int            index_mask;
+};
+
+/* Set up mmap information for @buf and store it in *@ip_p.
+ * Returns 0 on success and a non-zero error code on failure.
+ */
+int do_mmap_info(struct rxe_dev *rxe,
+                struct ib_udata *udata,
+                bool is_req,
+                struct ib_ucontext *context,
+                struct rxe_queue_buf *buf,
+                size_t buf_size,
+                struct rxe_mmap_info **ip_p);
+
+/* Allocate a queue; returns NULL on failure. *@num_elem is passed by
+ * pointer - presumably so the rounded-up element count can be reported
+ * back (TODO confirm against the definition).
+ */
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
+                                int *num_elem,
+                                unsigned int elem_size);
+
+/* Replace the storage of @q with a new queue sized for *@num_elem_p
+ * elements; returns 0 or a negative errno.
+ */
+int rxe_queue_resize(struct rxe_queue *q,
+                    unsigned int *num_elem_p,
+                    unsigned int elem_size,
+                    struct ib_ucontext *context,
+                    struct ib_udata *udata,
+                    /* Protect producers while resizing queue */
+                    spinlock_t *producer_lock,
+                    /* Protect consumers while resizing queue */
+                    spinlock_t *consumer_lock);
+
+/* Free @queue together with its buffer (or drop the mmap reference). */
+void rxe_queue_cleanup(struct rxe_queue *queue);
+
+/* Return the slot index that follows @index, wrapping around at the end of
+ * the ring.
+ *
+ * The mask comes from the kernel-private struct rxe_queue and not from
+ * q->buf: the buffer header can be shared with user space, so its copy of
+ * index_mask must not be trusted when computing kernel-side indices.
+ */
+static inline int next_index(struct rxe_queue *q, int index)
+{
+       return (index + 1) & q->index_mask;
+}
+
+/* Non-zero when producer and consumer indices coincide (modulo the mask). */
+static inline int queue_empty(struct rxe_queue *q)
+{
+       unsigned int prod = q->buf->producer_index;
+       unsigned int cons = q->buf->consumer_index;
+
+       return !((prod - cons) & q->index_mask);
+}
+
+/* Non-zero when advancing the producer once more would make it meet the
+ * consumer, i.e. only the one reserved slot is left.
+ */
+static inline int queue_full(struct rxe_queue *q)
+{
+       unsigned int prod = q->buf->producer_index;
+       unsigned int cons = q->buf->consumer_index;
+
+       return !((prod + 1 - cons) & q->index_mask);
+}
+
+/* Step the producer index forward by one slot, wrapping with the mask. */
+static inline void advance_producer(struct rxe_queue *q)
+{
+       unsigned int next = q->buf->producer_index + 1;
+
+       q->buf->producer_index = next & q->index_mask;
+}
+
+/* Step the consumer index forward by one slot, wrapping with the mask. */
+static inline void advance_consumer(struct rxe_queue *q)
+{
+       unsigned int next = q->buf->consumer_index + 1;
+
+       q->buf->consumer_index = next & q->index_mask;
+}
+
+/* Address of the slot the producer index currently points at. */
+static inline void *producer_addr(struct rxe_queue *q)
+{
+       unsigned int slot = q->buf->producer_index & q->index_mask;
+
+       return q->buf->data + (slot << q->log2_elem_size);
+}
+
+/* Address of the slot the consumer index currently points at. */
+static inline void *consumer_addr(struct rxe_queue *q)
+{
+       unsigned int slot = q->buf->consumer_index & q->index_mask;
+
+       return q->buf->data + (slot << q->log2_elem_size);
+}
+
+/* Raw (unmasked) producer index as stored in the buffer header. */
+static inline unsigned int producer_index(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *hdr = q->buf;
+
+       return hdr->producer_index;
+}
+
+/* Raw (unmasked) consumer index as stored in the buffer header. */
+static inline unsigned int consumer_index(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *hdr = q->buf;
+
+       return hdr->consumer_index;
+}
+
+/* Translate a slot index into the address of that slot.
+ *
+ * Both the mask and the shift are taken from the kernel-private
+ * struct rxe_queue. The copy of log2_elem_size in q->buf can be shared with
+ * user space and must not influence kernel address computation; this also
+ * matches producer_addr()/consumer_addr().
+ */
+static inline void *addr_from_index(struct rxe_queue *q, unsigned int index)
+{
+       return q->buf->data + ((index & q->index_mask)
+                               << q->log2_elem_size);
+}
+
+/* Inverse of addr_from_index(): slot index of the element at @addr. */
+static inline unsigned int index_from_addr(const struct rxe_queue *q,
+                                          const void *addr)
+{
+       ptrdiff_t byte_off = (u8 *)addr - q->buf->data;
+
+       return (byte_off >> q->log2_elem_size) & q->index_mask;
+}
+
+/* Number of elements currently held between consumer and producer. */
+static inline unsigned int queue_count(const struct rxe_queue *q)
+{
+       unsigned int prod = q->buf->producer_index;
+       unsigned int cons = q->buf->consumer_index;
+
+       return (prod - cons) & q->index_mask;
+}
+
+/* Oldest element in the queue, or NULL when the queue is empty. */
+static inline void *queue_head(struct rxe_queue *q)
+{
+       if (queue_empty(q))
+               return NULL;
+
+       return consumer_addr(q);
+}
+
+#endif /* RXE_QUEUE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
new file mode 100644 (file)
index 0000000..3d464c2
--- /dev/null
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Check that @pkt may be delivered to @qp: the qp must be valid, the opcode
+ * must pass the per-transport test for the qp type, and the relevant half
+ * of the qp (responder for requests, requester otherwise) must be in an
+ * acceptable state. Returns 0 if so, -EINVAL otherwise.
+ */
+static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                           struct rxe_qp *qp)
+{
+       if (unlikely(!qp->valid))
+               return -EINVAL;
+
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               /* NOTE(review): this is a bit test against IB_OPCODE_RC; if
+                * that constant is 0 the test can never fire - confirm.
+                */
+               if (unlikely((pkt->opcode & IB_OPCODE_RC) != 0)) {
+                       pr_warn_ratelimited("bad qp type\n");
+                       return -EINVAL;
+               }
+               break;
+       case IB_QPT_UC:
+               if (unlikely(!(pkt->opcode & IB_OPCODE_UC))) {
+                       pr_warn_ratelimited("bad qp type\n");
+                       return -EINVAL;
+               }
+               break;
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               if (unlikely(!(pkt->opcode & IB_OPCODE_UD))) {
+                       pr_warn_ratelimited("bad qp type\n");
+                       return -EINVAL;
+               }
+               break;
+       default:
+               pr_warn_ratelimited("unsupported qp type\n");
+               return -EINVAL;
+       }
+
+       if (!(pkt->mask & RXE_REQ_MASK)) {
+               /* not a request: requester side must be READY..DRAINED */
+               if (unlikely(qp->req.state < QP_STATE_READY ||
+                            qp->req.state > QP_STATE_DRAINED))
+                       return -EINVAL;
+       } else if (unlikely(qp->resp.state != QP_STATE_READY)) {
+               /* request: responder side must be READY */
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* Bump the port's bad-pkey counter by one, saturating at 0xffff.
+ * The update is done under port_lock with bottom halves disabled.
+ */
+static void set_bad_pkey_cntr(struct rxe_port *port)
+{
+       spin_lock_bh(&port->port_lock);
+       port->attr.bad_pkey_cntr = min((u32)0xffff,
+                                      port->attr.bad_pkey_cntr + 1);
+       spin_unlock_bh(&port->port_lock);
+}
+
+/* Bump the port's qkey-violation counter by one, saturating at 0xffff.
+ * The update is done under port_lock with bottom halves disabled.
+ */
+static void set_qkey_viol_cntr(struct rxe_port *port)
+{
+       spin_lock_bh(&port->port_lock);
+       port->attr.qkey_viol_cntr = min((u32)0xffff,
+                                       port->attr.qkey_viol_cntr + 1);
+       spin_unlock_bh(&port->port_lock);
+}
+
+/* Validate the pkey and, where applicable, the qkey carried by @pkt.
+ *
+ * For qpn 1 the whole port pkey table is searched; for any other non-zero
+ * qpn only the table entry selected by the qp is compared. pkt->pkey_index
+ * records the matching entry (0 when no lookup is done). A mismatch bumps
+ * the corresponding port counter and yields -EINVAL; otherwise 0.
+ */
+static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                     u32 qpn, struct rxe_qp *qp)
+{
+       struct rxe_port *port = &rxe->port;
+       u16 pkey = bth_pkey(pkt);
+       int matched = 0;
+       int slot;
+       u32 qkey;
+
+       pkt->pkey_index = 0;
+
+       if (qpn == 1) {
+               for (slot = 0; slot < port->attr.pkey_tbl_len; slot++) {
+                       if (!pkey_match(pkey, port->pkey_tbl[slot]))
+                               continue;
+
+                       pkt->pkey_index = slot;
+                       matched = 1;
+                       break;
+               }
+
+               if (!matched) {
+                       pr_warn_ratelimited("bad pkey = 0x%x\n", pkey);
+                       set_bad_pkey_cntr(port);
+                       return -EINVAL;
+               }
+       } else if (qpn != 0) {
+               if (unlikely(!pkey_match(pkey,
+                                        port->pkey_tbl[qp->attr.pkey_index]
+                                       ))) {
+                       pr_warn_ratelimited("bad pkey = 0x%0x\n", pkey);
+                       set_bad_pkey_cntr(port);
+                       return -EINVAL;
+               }
+               pkt->pkey_index = qp->attr.pkey_index;
+       }
+
+       /* the qkey is only compared for UD/GSI qps, never for qpn 0, and
+        * only when the packet mask is non-zero
+        */
+       if (qp_type(qp) != IB_QPT_UD && qp_type(qp) != IB_QPT_GSI)
+               return 0;
+       if (qpn == 0 || !pkt->mask)
+               return 0;
+
+       if (qpn == 1)
+               qkey = GSI_QKEY;
+       else
+               qkey = qp->attr.qkey;
+
+       if (unlikely(deth_qkey(pkt) != qkey)) {
+               pr_warn_ratelimited("bad qkey, got 0x%x expected 0x%x for qpn 0x%x\n",
+                                   deth_qkey(pkt), qkey, qpn);
+               set_qkey_viol_cntr(port);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* For RC and UC qps, verify that the packet arrived on the qp's port and
+ * that its IP source/destination are the mirror image of the addresses in
+ * the qp's primary address vector. Other qp types are accepted unchecked,
+ * as are packets whose protocol is neither IPv4 nor IPv6.
+ * Returns 0 on success, -EINVAL on a mismatch.
+ */
+static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                     struct rxe_qp *qp)
+{
+       struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+       if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC)
+               return 0;
+
+       if (unlikely(pkt->port_num != qp->attr.port_num)) {
+               pr_warn_ratelimited("port %d != qp port %d\n",
+                                   pkt->port_num, qp->attr.port_num);
+               return -EINVAL;
+       }
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               struct in_addr *local =
+                       &qp->pri_av.sgid_addr._sockaddr_in.sin_addr;
+               struct in_addr *remote =
+                       &qp->pri_av.dgid_addr._sockaddr_in.sin_addr;
+
+               /* packet destination must be our (source) address */
+               if (ip_hdr(skb)->daddr != local->s_addr) {
+                       pr_warn_ratelimited("dst addr %pI4 != qp source addr %pI4\n",
+                                           &ip_hdr(skb)->daddr,
+                                           &local->s_addr);
+                       return -EINVAL;
+               }
+
+               /* packet source must be the peer (destination) address */
+               if (ip_hdr(skb)->saddr != remote->s_addr) {
+                       pr_warn_ratelimited("source addr %pI4 != qp dst addr %pI4\n",
+                                           &ip_hdr(skb)->saddr,
+                                           &remote->s_addr);
+                       return -EINVAL;
+               }
+
+               return 0;
+       }
+
+       if (skb->protocol == htons(ETH_P_IPV6)) {
+               struct in6_addr *local =
+                       &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr;
+               struct in6_addr *remote =
+                       &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr;
+
+               if (memcmp(&ipv6_hdr(skb)->daddr, local, sizeof(*local))) {
+                       pr_warn_ratelimited("dst addr %pI6 != qp source addr %pI6\n",
+                                           &ipv6_hdr(skb)->daddr, local);
+                       return -EINVAL;
+               }
+
+               if (memcmp(&ipv6_hdr(skb)->saddr, remote, sizeof(*remote))) {
+                       pr_warn_ratelimited("source addr %pI6 != qp dst addr %pI6\n",
+                                           &ipv6_hdr(skb)->saddr, remote);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/* Validate the transport header of @pkt and resolve its destination qp.
+ *
+ * Multicast packets only need a GRH and leave pkt->qp NULL. For every
+ * other qpn the qp is looked up in the qp pool (qpn 0 and 1 map to the
+ * port's SMI/GSI indices) and run through the type/state, address and key
+ * checks. On success pkt->qp is set and the object obtained from the lookup
+ * is kept; on any check failure rxe_drop_ref() is called on it.
+ * Returns 0 on success and -EINVAL on every failure.
+ */
+static int hdr_check(struct rxe_pkt_info *pkt)
+{
+       struct rxe_dev *rxe = pkt->rxe;
+       struct rxe_port *port = &rxe->port;
+       struct rxe_qp *qp;
+       u32 qpn = bth_qpn(pkt);
+       int index;
+       int err;
+
+       if (unlikely(bth_tver(pkt) != BTH_TVER)) {
+               pr_warn_ratelimited("bad tver\n");
+               return -EINVAL;
+       }
+
+       if (qpn == IB_MULTICAST_QPN) {
+               if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) {
+                       pr_warn_ratelimited("no grh for mcast qpn\n");
+                       return -EINVAL;
+               }
+
+               pkt->qp = NULL;
+               return 0;
+       }
+
+       if (qpn == 0)
+               index = port->qp_smi_index;
+       else if (qpn == 1)
+               index = port->qp_gsi_index;
+       else
+               index = qpn;
+
+       qp = rxe_pool_get_index(&rxe->qp_pool, index);
+       if (unlikely(!qp)) {
+               pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn);
+               return -EINVAL;
+       }
+
+       /* run the checks in order, stopping at the first failure */
+       err = check_type_state(rxe, pkt, qp);
+       if (likely(!err))
+               err = check_addr(rxe, pkt, qp);
+       if (likely(!err))
+               err = check_keys(rxe, pkt, qpn, qp);
+
+       if (unlikely(err)) {
+               rxe_drop_ref(qp);
+               return -EINVAL;
+       }
+
+       pkt->qp = qp;
+       return 0;
+}
+
+/* Hand @skb to the responder queue when the packet is a request and to the
+ * completer queue otherwise.
+ */
+static inline void rxe_rcv_pkt(struct rxe_dev *rxe,
+                              struct rxe_pkt_info *pkt,
+                              struct sk_buff *skb)
+{
+       if (!(pkt->mask & RXE_REQ_MASK)) {
+               rxe_comp_queue_pkt(rxe, pkt->qp, skb);
+               return;
+       }
+
+       rxe_resp_queue_pkt(rxe, pkt->qp, skb);
+}
+
+/* Deliver a multicast packet to every qp attached to the destination group.
+ *
+ * The destination GID is derived from the IP header (IPv4 addresses are
+ * v4-mapped) and used to look up the multicast group. Each attached qp that
+ * passes the type/state and key checks receives the skb; a clone is made
+ * for every qp but the last in the list. Any skb left over at the end (no
+ * group, no accepting qp, unknown protocol) is freed here.
+ */
+static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct rxe_mc_grp *mcg;
+       struct sk_buff *skb_copy;
+       struct rxe_mc_elem *mce;
+       struct rxe_qp *qp;
+       union ib_gid dgid;
+       int err;
+
+       if (skb->protocol == htons(ETH_P_IP))
+               ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+                                      (struct in6_addr *)&dgid);
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid));
+       else
+               goto err1;      /* no GID to look up; don't use stack garbage */
+
+       /* lookup mcast group corresponding to mgid, takes a ref */
+       mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid);
+       if (!mcg)
+               goto err1;      /* mcast group not registered */
+
+       spin_lock_bh(&mcg->mcg_lock);
+
+       list_for_each_entry(mce, &mcg->qp_list, qp_list) {
+               qp = mce->qp;
+               pkt = SKB_TO_PKT(skb);
+
+               /* validate qp for incoming packet */
+               err = check_type_state(rxe, pkt, qp);
+               if (err)
+                       continue;
+
+               err = check_keys(rxe, pkt, bth_qpn(pkt), qp);
+               if (err)
+                       continue;
+
+               /* if *not* the last qp in the list
+                * make a copy of the skb to post to the next qp.
+                * mcg_lock is held with BHs disabled, so the allocation
+                * must not sleep: GFP_ATOMIC, not GFP_KERNEL.
+                */
+               skb_copy = (mce->qp_list.next != &mcg->qp_list) ?
+                               skb_clone(skb, GFP_ATOMIC) : NULL;
+
+               pkt->qp = qp;
+               rxe_add_ref(qp);
+               rxe_rcv_pkt(rxe, pkt, skb);
+
+               /* continue with the clone; stop if there is none */
+               skb = skb_copy;
+               if (!skb)
+                       break;
+       }
+
+       spin_unlock_bh(&mcg->mcg_lock);
+
+       rxe_drop_ref(mcg);      /* drop ref from rxe_pool_get_key. */
+
+err1:
+       if (skb)
+               kfree_skb(skb);
+}
+
+/* Look up the packet's destination address in the cached GID table of port
+ * 1 (RoCE UDP-encap type, bound to rxe->ndev). IPv4 destinations are
+ * converted to a v4-mapped GID first; any other protocol is read as IPv6.
+ * Returns whatever ib_find_cached_gid_by_port() returns.
+ */
+static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+       union ib_gid mapped;
+       union ib_gid *gid;
+       u16 index;
+
+       if (skb->protocol != htons(ETH_P_IP)) {
+               gid = (union ib_gid *)&ipv6_hdr(skb)->daddr;
+       } else {
+               ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+                                      (struct in6_addr *)&mapped);
+               gid = &mapped;
+       }
+
+       return ib_find_cached_gid_by_port(&rxe->ib_dev, gid,
+                                         IB_GID_TYPE_ROCE_UDP_ENCAP,
+                                         1, rxe->ndev, &index);
+}
+
+/* rxe_rcv is called from the interface driver
+ *
+ * Validates an incoming packet (length, destination GID, header checks,
+ * ICRC) and passes it on to the multicast or unicast receive path. A packet
+ * that fails any check is freed, together with the qp reference taken by
+ * hdr_check() if there is one. Always returns 0.
+ */
+int rxe_rcv(struct sk_buff *skb)
+{
+       int err;
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct rxe_dev *rxe = pkt->rxe;
+       __be32 *icrcp;
+       u32 calc_icrc, pack_icrc;
+
+       pkt->offset = 0;
+       /* the drop path tests pkt->qp, so clear it before the first
+        * "goto drop" rather than after the early length/dgid checks
+        */
+       pkt->qp = NULL;
+
+       if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES))
+               goto drop;
+
+       if (unlikely(rxe_match_dgid(rxe, skb) < 0)) {
+               pr_warn_ratelimited("failed matching dgid\n");
+               goto drop;
+       }
+
+       pkt->opcode = bth_opcode(pkt);
+       pkt->psn = bth_psn(pkt);
+       pkt->mask |= rxe_opcode[pkt->opcode].mask;
+
+       if (unlikely(skb->len < header_size(pkt)))
+               goto drop;
+
+       err = hdr_check(pkt);
+       if (unlikely(err))
+               goto drop;
+
+       /* Verify ICRC */
+       icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE);
+       pack_icrc = be32_to_cpu(*icrcp);
+
+       calc_icrc = rxe_icrc_hdr(pkt, skb);
+       calc_icrc = crc32_le(calc_icrc, (u8 *)payload_addr(pkt), payload_size(pkt));
+       calc_icrc = cpu_to_be32(~calc_icrc);
+       if (unlikely(calc_icrc != pack_icrc)) {
+               /* print the address straight from the header: a textual
+                * IPv6 address does not fit in sizeof(struct in6_addr)
+                * bytes, so no intermediate buffer is used
+                */
+               if (skb->protocol == htons(ETH_P_IPV6))
+                       pr_warn_ratelimited("bad ICRC from %pI6\n",
+                                           &ipv6_hdr(skb)->saddr);
+               else if (skb->protocol == htons(ETH_P_IP))
+                       pr_warn_ratelimited("bad ICRC from %pI4\n",
+                                           &ip_hdr(skb)->saddr);
+               else
+                       pr_warn_ratelimited("bad ICRC from unknown\n");
+
+               goto drop;
+       }
+
+       if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN))
+               rxe_rcv_mcast_pkt(rxe, skb);
+       else
+               rxe_rcv_pkt(rxe, pkt, skb);
+
+       return 0;
+
+drop:
+       if (pkt->qp)
+               rxe_drop_ref(pkt->qp);
+
+       kfree_skb(skb);
+       return 0;
+}
+EXPORT_SYMBOL(rxe_rcv);
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
new file mode 100644 (file)
index 0000000..33b2d9d
--- /dev/null
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                      unsigned opcode);
+
+/* Fast-forward @wqe past @npsn packets: for each packet, recompute the
+ * requester opcode, consume up to one MTU of the remaining DMA data, and
+ * for writes advance the remote address by one MTU.
+ */
+static inline void retry_first_write_send(struct rxe_qp *qp,
+                                         struct rxe_send_wqe *wqe,
+                                         unsigned mask, int npsn)
+{
+       int remaining;
+
+       for (remaining = npsn; remaining > 0; remaining--) {
+               int chunk;
+
+               /* one packet carries at most one MTU of payload */
+               if (wqe->dma.resid > qp->mtu)
+                       chunk = qp->mtu;
+               else
+                       chunk = wqe->dma.resid;
+
+               qp->req.opcode = next_opcode(qp, wqe,
+                                            wqe->wr.opcode);
+
+               if (!(wqe->wr.send_flags & IB_SEND_INLINE)) {
+                       advance_dma_data(&wqe->dma, chunk);
+               } else {
+                       wqe->dma.resid -= chunk;
+                       wqe->dma.sge_offset += chunk;
+               }
+
+               if (mask & WR_WRITE_MASK)
+                       wqe->iova += qp->mtu;
+       }
+}
+
+/* Rewind the requester so that everything not yet completed is sent again.
+ *
+ * The requester's wqe index and psn are reset to the completer's position,
+ * then every wqe from the consumer index up to the first one still in the
+ * posted state is put back into the posted state with its DMA cursor reset.
+ * The first wqe that is actually retried may have been partially
+ * acknowledged; it is advanced past the packets already acked.
+ */
+static void req_retry(struct rxe_qp *qp)
+{
+       struct rxe_send_wqe *wqe;
+       unsigned int wqe_index;
+       unsigned int mask;
+       int npsn;
+       int first = 1;
+
+       qp->req.wqe_index       = consumer_index(qp->sq.queue);
+       qp->req.psn             = qp->comp.psn;
+       qp->req.opcode          = -1;
+
+       for (wqe_index = consumer_index(qp->sq.queue);
+               wqe_index != producer_index(qp->sq.queue);
+               wqe_index = next_index(qp->sq.queue, wqe_index)) {
+               wqe = addr_from_index(qp->sq.queue, wqe_index);
+               mask = wr_opcode_mask(wqe->wr.opcode, qp);
+
+               if (wqe->state == wqe_state_posted)
+                       break;
+
+               if (wqe->state == wqe_state_done)
+                       continue;
+
+               wqe->iova = (mask & WR_ATOMIC_MASK) ?
+                            wqe->wr.wr.atomic.remote_addr :
+                            (mask & WR_READ_OR_WRITE_MASK) ?
+                            wqe->wr.wr.rdma.remote_addr :
+                            0;
+
+               if (!first || (mask & WR_READ_MASK) == 0) {
+                       wqe->dma.resid = wqe->dma.length;
+                       wqe->dma.cur_sge = 0;
+                       wqe->dma.sge_offset = 0;
+               }
+
+               if (first) {
+                       first = 0;
+
+                       /* Count the acked packets relative to the wqe that
+                        * is really being retried. Doing this here, instead
+                        * of on queue_head() before the loop, avoids a NULL
+                        * dereference on an empty queue and a wrong count
+                        * when the head wqe is already done and skipped.
+                        */
+                       npsn = (qp->comp.psn - wqe->first_psn) &
+                               BTH_PSN_MASK;
+
+                       if (mask & WR_WRITE_OR_SEND_MASK)
+                               retry_first_write_send(qp, wqe, mask, npsn);
+
+                       if (mask & WR_READ_MASK)
+                               wqe->iova += npsn * qp->mtu;
+               }
+
+               wqe->state = wqe_state_posted;
+       }
+}
+
+/* Timer callback: @data carries the qp pointer; kicks the requester task. */
+void rnr_nak_timer(unsigned long data)
+{
+       struct rxe_qp *qp;
+
+       qp = (struct rxe_qp *)data;
+
+       pr_debug("rnr nak timer fired\n");
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* Pick the next send wqe for the requester, or NULL if there is nothing it
+ * may work on right now.
+ *
+ * While the qp is draining this first checks, under state_lock, whether
+ * the send queue has drained; if so the state moves to DRAINED and an
+ * IB_EVENT_SQ_DRAINED event is raised (outside the lock). NULL is also
+ * returned when the requester has caught up with the producer, when the qp
+ * is draining/drained and the wqe is not already being processed, or when
+ * a fenced wqe has to wait for earlier work (wait_fence is set then).
+ */
+static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
+{
+       struct rxe_send_wqe *wqe = queue_head(qp->sq.queue);
+       unsigned long flags;
+
+       if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+               int drained = 0;
+
+               /* check to see if we are drained;
+                * state_lock used by requester and completer
+                */
+               spin_lock_irqsave(&qp->state_lock, flags);
+
+               /* re-test under the lock: comp may just have finished */
+               if (qp->req.state == QP_STATE_DRAIN) {
+                       int comp_pending = wqe &&
+                               ((qp->req.wqe_index !=
+                                 consumer_index(qp->sq.queue)) ||
+                                (wqe->state != wqe_state_posted));
+
+                       if (!comp_pending) {
+                               qp->req.state = QP_STATE_DRAINED;
+                               drained = 1;
+                       }
+               }
+
+               spin_unlock_irqrestore(&qp->state_lock, flags);
+
+               if (drained && qp->ibqp.event_handler) {
+                       struct ib_event ev;
+
+                       ev.device = qp->ibqp.device;
+                       ev.element.qp = &qp->ibqp;
+                       ev.event = IB_EVENT_SQ_DRAINED;
+                       qp->ibqp.event_handler(&ev,
+                               qp->ibqp.qp_context);
+               }
+       }
+
+       /* requester has caught up with everything that was posted */
+       if (qp->req.wqe_index == producer_index(qp->sq.queue))
+               return NULL;
+
+       wqe = addr_from_index(qp->sq.queue, qp->req.wqe_index);
+
+       /* while draining only a wqe already in flight may continue */
+       if (unlikely((qp->req.state == QP_STATE_DRAIN ||
+                     qp->req.state == QP_STATE_DRAINED) &&
+                    (wqe->state != wqe_state_processing)))
+               return NULL;
+
+       /* a fenced wqe waits until it is at the head of the queue */
+       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
+                    (qp->req.wqe_index != consumer_index(qp->sq.queue)))) {
+               qp->req.wait_fence = 1;
+               return NULL;
+       }
+
+       wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
+       return wqe;
+}
+
+/* Choose the RC wire opcode for the next packet of a work request.
+ *
+ * @opcode is the work-request opcode and @fits is non-zero when the
+ * remaining data fits into this packet. Whether the packet is a
+ * FIRST/ONLY or a MIDDLE/LAST depends on the previous wire opcode stored
+ * in qp->req.opcode. Returns -EINVAL for opcodes not handled here.
+ */
+static int next_opcode_rc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+       /* is a multi-packet write / send message already under way? */
+       int in_write = qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+                      qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+       int in_send = qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+                     qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE;
+
+       switch (opcode) {
+       case IB_WR_RDMA_WRITE:
+               if (in_write)
+                       return fits ? IB_OPCODE_RC_RDMA_WRITE_LAST :
+                               IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_RC_RDMA_WRITE_ONLY :
+                       IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (in_write)
+                       return fits ?
+                               IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+                               IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+                       IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+       case IB_WR_SEND:
+               if (in_send)
+                       return fits ? IB_OPCODE_RC_SEND_LAST :
+                               IB_OPCODE_RC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_RC_SEND_ONLY :
+                       IB_OPCODE_RC_SEND_FIRST;
+
+       case IB_WR_SEND_WITH_IMM:
+               if (in_send)
+                       return fits ? IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE :
+                               IB_OPCODE_RC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE :
+                       IB_OPCODE_RC_SEND_FIRST;
+
+       case IB_WR_SEND_WITH_INV:
+               if (in_send)
+                       return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE :
+                               IB_OPCODE_RC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
+                       IB_OPCODE_RC_SEND_FIRST;
+
+       case IB_WR_RDMA_READ:
+               return IB_OPCODE_RC_RDMA_READ_REQUEST;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+               return IB_OPCODE_RC_COMPARE_SWAP;
+
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               return IB_OPCODE_RC_FETCH_ADD;
+
+       case IB_WR_REG_MR:
+       case IB_WR_LOCAL_INV:
+               /* these are passed through unchanged */
+               return opcode;
+
+       default:
+               break;
+       }
+
+       return -EINVAL;
+}
+
+/* Choose the UC wire opcode for the next packet of a work request.
+ *
+ * Same scheme as the RC variant, limited to writes and sends (with or
+ * without immediate). @fits is non-zero when the remaining data fits into
+ * this packet. Returns -EINVAL for any other work-request opcode.
+ */
+static int next_opcode_uc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+       /* is a multi-packet write / send message already under way? */
+       int in_write = qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+                      qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+       int in_send = qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+                     qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE;
+
+       switch (opcode) {
+       case IB_WR_RDMA_WRITE:
+               if (in_write)
+                       return fits ? IB_OPCODE_UC_RDMA_WRITE_LAST :
+                               IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_UC_RDMA_WRITE_ONLY :
+                       IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (in_write)
+                       return fits ?
+                               IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+                               IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+                       IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+       case IB_WR_SEND:
+               if (in_send)
+                       return fits ? IB_OPCODE_UC_SEND_LAST :
+                               IB_OPCODE_UC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_UC_SEND_ONLY :
+                       IB_OPCODE_UC_SEND_FIRST;
+
+       case IB_WR_SEND_WITH_IMM:
+               if (in_send)
+                       return fits ? IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE :
+                               IB_OPCODE_UC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE :
+                       IB_OPCODE_UC_SEND_FIRST;
+
+       default:
+               break;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Map a work request opcode to the wire opcode of the next packet,
+ * dispatching on the QP type.
+ *
+ * RC and UC are delegated to their own helpers together with a flag
+ * saying whether the data still to be sent fits within qp->mtu.
+ * SMI, UD and GSI only accept plain sends and sends with immediate.
+ *
+ * Returns the wire opcode, or -EINVAL for an unsupported combination.
+ */
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                      unsigned opcode)
+{
+       int fits = (wqe->dma.resid <= qp->mtu);
+
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               return next_opcode_rc(qp, opcode, fits);
+
+       case IB_QPT_UC:
+               return next_opcode_uc(qp, opcode, fits);
+
+       case IB_QPT_SMI:
+       case IB_QPT_UD:
+       case IB_QPT_GSI:
+               if (opcode == IB_WR_SEND)
+                       return IB_OPCODE_UD_SEND_ONLY;
+               if (opcode == IB_WR_SEND_WITH_IMM)
+                       return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+               return -EINVAL;
+
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * Reserve one slot of the QP's read/atomic budget for this wqe.
+ *
+ * A wqe that already holds a slot (has_rd_atomic set) passes straight
+ * through. Otherwise qp->req.rd_atomic is decremented; if that drives
+ * it below zero the decrement is undone, need_rd_atomic stays set and
+ * -EAGAIN is returned. On success the wqe is marked as holding a slot
+ * and 0 is returned.
+ */
+static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+       if (wqe->has_rd_atomic)
+               return 0;
+
+       qp->req.need_rd_atomic = 1;
+
+       if (atomic_dec_return(&qp->req.rd_atomic) < 0) {
+               /* nothing left: give the slot back and ask for a retry */
+               atomic_inc(&qp->req.rd_atomic);
+               return -EAGAIN;
+       }
+
+       qp->req.need_rd_atomic = 0;
+       wqe->has_rd_atomic = 1;
+       return 0;
+}
+
+/*
+ * Return the maximum payload size to use for a request packet.
+ *
+ * RC and UC queue pairs use the MTU stored in the QP; every other QP
+ * type uses the port's mtu_cap. @wqe is not consulted at present and
+ * is kept so that the signature stays unchanged for callers.
+ */
+static inline int get_mtu(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+       if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))
+               return qp->mtu;
+
+       return rxe->port.mtu_cap;
+}
+
+/*
+ * Allocate an skb for one request packet and fill in its headers.
+ *
+ * @qp:      sending queue pair
+ * @wqe:     send work queue element the packet belongs to
+ * @opcode:  wire opcode, used to index rxe_opcode[]
+ * @payload: number of payload bytes this packet will carry
+ * @pkt:     packet info structure, initialized here
+ *
+ * Fills in the BTH and whichever of RETH, IMMDT, IETH, ATMETH and DETH
+ * the opcode's mask calls for. The payload itself is not copied here.
+ *
+ * Returns the new skb, or NULL when the interface layer's init_packet
+ * callback fails.
+ */
+static struct sk_buff *init_req_packet(struct rxe_qp *qp,
+                                      struct rxe_send_wqe *wqe,
+                                      int opcode, int payload,
+                                      struct rxe_pkt_info *pkt)
+{
+       struct rxe_dev          *rxe = to_rdev(qp->ibqp.device);
+       struct rxe_port         *port = &rxe->port;
+       struct sk_buff          *skb;
+       struct rxe_send_wr      *ibwr = &wqe->wr;
+       struct rxe_av           *av;
+       /* bytes needed to round the payload up to a multiple of four */
+       int                     pad = (-payload) & 0x3;
+       int                     paylen;
+       int                     solicited;
+       u16                     pkey;
+       u32                     qp_num;
+       int                     ack_req;
+
+       /* length from start of bth to end of icrc */
+       paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+
+       /* pkt->hdr, rxe, port_num and mask are initialized in ifc
+        * layer
+        */
+       pkt->opcode     = opcode;
+       pkt->qp         = qp;
+       pkt->psn        = qp->req.psn;
+       pkt->mask       = rxe_opcode[opcode].mask;
+       pkt->paylen     = paylen;
+       pkt->offset     = 0;
+       pkt->wqe        = wqe;
+
+       /* init skb */
+       av = rxe_get_av(pkt);
+       skb = rxe->ifc_ops->init_packet(rxe, av, paylen, pkt);
+       if (unlikely(!skb))
+               return NULL;
+
+       /* init bth */
+       /* solicited only on the final packet of a send, or of a write
+        * that carries immediate data, and only if the WR asked for it
+        */
+       solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
+                       (pkt->mask & RXE_END_MASK) &&
+                       ((pkt->mask & (RXE_SEND_MASK)) ||
+                       (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
+                       (RXE_WRITE_MASK | RXE_IMMDT_MASK));
+
+       /* GSI takes the pkey index from the work request, all other QP
+        * types from the QP attributes
+        */
+       pkey = (qp_type(qp) == IB_QPT_GSI) ?
+                port->pkey_tbl[ibwr->wr.ud.pkey_index] :
+                port->pkey_tbl[qp->attr.pkey_index];
+
+       /* packets with a DETH are addressed per work request */
+       qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn :
+                                        qp->attr.dest_qp_num;
+
+       /* request an ack on the last packet of a message, or once more
+        * than RXE_MAX_PKT_PER_ACK packets went out without one; note
+        * that noack_pkts is only incremented when the first test fails
+        */
+       ack_req = ((pkt->mask & RXE_END_MASK) ||
+               (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+       if (ack_req)
+               qp->req.noack_pkts = 0;
+
+       bth_init(pkt, pkt->opcode, solicited, 0, pad, pkey, qp_num,
+                ack_req, pkt->psn);
+
+       /* init optional headers */
+       if (pkt->mask & RXE_RETH_MASK) {
+               reth_set_rkey(pkt, ibwr->wr.rdma.rkey);
+               reth_set_va(pkt, wqe->iova);
+               reth_set_len(pkt, wqe->dma.length);
+       }
+
+       if (pkt->mask & RXE_IMMDT_MASK)
+               immdt_set_imm(pkt, ibwr->ex.imm_data);
+
+       if (pkt->mask & RXE_IETH_MASK)
+               ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey);
+
+       if (pkt->mask & RXE_ATMETH_MASK) {
+               atmeth_set_va(pkt, wqe->iova);
+               /* compare-and-swap fills both operand fields; any other
+                * atomic only fills swap_add, from compare_add
+                */
+               if (opcode == IB_OPCODE_RC_COMPARE_SWAP ||
+                   opcode == IB_OPCODE_RD_COMPARE_SWAP) {
+                       atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap);
+                       atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add);
+               } else {
+                       atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add);
+               }
+               atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey);
+       }
+
+       if (pkt->mask & RXE_DETH_MASK) {
+               /* QP number 1 always uses GSI_QKEY instead of the
+                * qkey supplied in the work request
+                */
+               if (qp->ibqp.qp_num == 1)
+                       deth_set_qkey(pkt, GSI_QKEY);
+               else
+                       deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey);
+               deth_set_sqp(pkt, qp->ibqp.qp_num);
+       }
+
+       return skb;
+}
+
+/*
+ * Copy the payload into a request packet and append the ICRC.
+ *
+ * @qp:     sending queue pair
+ * @wqe:    work queue element supplying the data
+ * @pkt:    packet info set up by init_req_packet()
+ * @skb:    buffer handed to the interface layer's prepare callback
+ * @paylen: number of payload bytes to copy into this packet
+ *
+ * Returns 0 on success, or the error returned by the prepare callback
+ * or by copy_data().
+ */
+static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                      struct rxe_pkt_info *pkt, struct sk_buff *skb,
+                      int paylen)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       u32 crc = 0;
+       u32 *p;
+       int err;
+
+       /* prepare receives &crc and may update it; presumably it seeds
+        * the checksum over the headers -- confirm in the ifc layer
+        */
+       err = rxe->ifc_ops->prepare(rxe, pkt, skb, &crc);
+       if (err)
+               return err;
+
+       if (pkt->mask & RXE_WRITE_OR_SEND) {
+               if (wqe->wr.send_flags & IB_SEND_INLINE) {
+                       /* inline data lives in the wqe itself */
+                       u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
+
+                       crc = crc32_le(crc, tmp, paylen);
+
+                       memcpy(payload_addr(pkt), tmp, paylen);
+
+                       /* advance the dma state by hand on this path */
+                       wqe->dma.resid -= paylen;
+                       wqe->dma.sge_offset += paylen;
+               } else {
+                       /* copy_data is handed &wqe->dma and &crc;
+                        * presumably it advances both -- TODO confirm
+                        */
+                       err = copy_data(rxe, qp->pd, 0, &wqe->dma,
+                                       payload_addr(pkt), paylen,
+                                       from_mem_obj,
+                                       &crc);
+                       if (err)
+                               return err;
+               }
+       }
+       /* the ICRC goes right after the payload and its pad bytes */
+       p = payload_addr(pkt) + paylen + bth_pad(pkt);
+
+       *p = ~crc;
+
+       return 0;
+}
+
+/*
+ * Advance the wqe state for the packet that is about to be sent and
+ * hand the previous state back through @prev_state so that the caller
+ * can restore it.
+ *
+ * A packet that does not end the message moves the wqe to
+ * wqe_state_processing. A packet that ends the message moves it to
+ * wqe_state_pending on RC queue pairs and leaves it untouched on all
+ * other QP types.
+ */
+static void update_wqe_state(struct rxe_qp *qp,
+                            struct rxe_send_wqe *wqe,
+                            struct rxe_pkt_info *pkt,
+                            enum wqe_state *prev_state)
+{
+       const enum wqe_state saved = wqe->state;
+
+       if (!(pkt->mask & RXE_END_MASK))
+               wqe->state = wqe_state_processing;
+       else if (qp_type(qp) == IB_QPT_RC)
+               wqe->state = wqe_state_pending;
+
+       *prev_state = saved;
+}
+
+/*
+ * Update requester bookkeeping after a packet was handed off for
+ * transmission: PSN range of the wqe, next PSN, last opcode, send
+ * queue index and the retransmit timer.
+ *
+ * @payload: number of payload bytes carried by the packet just sent
+ */
+static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                        struct rxe_pkt_info *pkt, int payload)
+{
+       /* number of packets left to send including current one */
+       /* NOTE(review): adding payload back assumes dma.resid was already
+        * reduced for this packet (fill_packet does so on the inline path)
+        */
+       int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
+
+       /* handle zero length packet case */
+       if (num_pkt == 0)
+               num_pkt = 1;
+
+       /* first packet of a message: record the PSN range it will span */
+       if (pkt->mask & RXE_START_MASK) {
+               wqe->first_psn = qp->req.psn;
+               wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK;
+       }
+
+       /* a read request skips num_pkt PSNs at once; any other packet
+        * uses exactly one
+        */
+       if (pkt->mask & RXE_READ_MASK)
+               qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK;
+       else
+               qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+
+       /* remembered so the next opcode can continue this message */
+       qp->req.opcode = pkt->opcode;
+
+
+       /* message complete: move on to the next send queue entry */
+       if (pkt->mask & RXE_END_MASK)
+               qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index);
+
+       qp->need_req_skb = 0;
+
+       /* arm the retransmit timer unless it is disabled or already running */
+       if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer))
+               mod_timer(&qp->retrans_timer,
+                         jiffies + qp->qp_timeout_jiffies);
+}
+
+/*
+ * Requester task for a queue pair.
+ *
+ * @arg: the struct rxe_qp to work on
+ *
+ * Loops over the send queue: local operations (IB_WR_LOCAL_INV and
+ * IB_WR_REG_MR) are executed in place, every other work request is
+ * turned into request packets which are built, filled and transmitted
+ * one at a time. The loop ends when there is nothing more to do, when
+ * a resource limit is hit, or when an error occurs.
+ *
+ * Returns 0 after the error path or after a faked UD completion, and
+ * -EAGAIN on every other way out.
+ */
+int rxe_requester(void *arg)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)arg;
+       struct rxe_pkt_info pkt;
+       struct sk_buff *skb;
+       struct rxe_send_wqe *wqe;
+       unsigned mask;
+       int payload;
+       int mtu;
+       int opcode;
+       int ret;
+       enum wqe_state prev_state;
+
+next_wqe:
+       if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
+               goto exit;
+
+       /* a QP in reset gets its requester state reinitialized */
+       if (unlikely(qp->req.state == QP_STATE_RESET)) {
+               qp->req.wqe_index = consumer_index(qp->sq.queue);
+               qp->req.opcode = -1;
+               qp->req.need_rd_atomic = 0;
+               qp->req.wait_psn = 0;
+               qp->req.need_retry = 0;
+               goto exit;
+       }
+
+       if (unlikely(qp->req.need_retry)) {
+               req_retry(qp);
+               qp->req.need_retry = 0;
+       }
+
+       wqe = req_next_wqe(qp);
+       if (unlikely(!wqe))
+               goto exit;
+
+       /* local operations produce no packets */
+       if (wqe->mask & WR_REG_MASK) {
+               if (wqe->wr.opcode == IB_WR_LOCAL_INV) {
+                       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+                       struct rxe_mem *rmr;
+
+                       rmr = rxe_pool_get_index(&rxe->mr_pool,
+                                                wqe->wr.ex.invalidate_rkey >> 8);
+                       if (!rmr) {
+                               pr_err("No mr for key %#x\n", wqe->wr.ex.invalidate_rkey);
+                               wqe->state = wqe_state_error;
+                               wqe->status = IB_WC_MW_BIND_ERR;
+                               goto exit;
+                       }
+                       rmr->state = RXE_MEM_STATE_FREE;
+                       /* release the reference taken by the pool lookup,
+                        * otherwise the MR can never be freed
+                        */
+                       rxe_drop_ref(rmr);
+                       wqe->state = wqe_state_done;
+                       wqe->status = IB_WC_SUCCESS;
+               } else if (wqe->wr.opcode == IB_WR_REG_MR) {
+                       struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr);
+
+                       rmr->state = RXE_MEM_STATE_VALID;
+                       rmr->access = wqe->wr.wr.reg.access;
+                       rmr->lkey = wqe->wr.wr.reg.key;
+                       rmr->rkey = wqe->wr.wr.reg.key;
+                       wqe->state = wqe_state_done;
+                       wqe->status = IB_WC_SUCCESS;
+               } else {
+                       goto exit;
+               }
+               qp->req.wqe_index = next_index(qp->sq.queue,
+                                               qp->req.wqe_index);
+               goto next_wqe;
+       }
+
+       /* RC only: stop when too many PSNs are outstanding */
+       if (unlikely(qp_type(qp) == IB_QPT_RC &&
+                    qp->req.psn > (qp->comp.psn + RXE_MAX_UNACKED_PSNS))) {
+               qp->req.wait_psn = 1;
+               goto exit;
+       }
+
+       /* Limit the number of inflight SKBs per QP */
+       if (unlikely(atomic_read(&qp->skb_out) >
+                    RXE_INFLIGHT_SKBS_PER_QP_HIGH)) {
+               qp->need_req_skb = 1;
+               goto exit;
+       }
+
+       opcode = next_opcode(qp, wqe, wqe->wr.opcode);
+       if (unlikely(opcode < 0)) {
+               wqe->status = IB_WC_LOC_QP_OP_ERR;
+               goto exit;
+       }
+
+       mask = rxe_opcode[opcode].mask;
+       if (unlikely(mask & RXE_READ_OR_ATOMIC)) {
+               if (check_init_depth(qp, wqe))
+                       goto exit;
+       }
+
+       mtu = get_mtu(qp, wqe);
+       payload = (mask & RXE_WRITE_OR_SEND) ? wqe->dma.resid : 0;
+       if (payload > mtu) {
+               if (qp_type(qp) == IB_QPT_UD) {
+                       /* C10-93.1.1: If the total sum of all the buffer lengths specified for a
+                        * UD message exceeds the MTU of the port as returned by QueryHCA, the CI
+                        * shall not emit any packets for this message. Further, the CI shall not
+                        * generate an error due to this condition.
+                        */
+
+                       /* fake a successful UD send */
+                       wqe->first_psn = qp->req.psn;
+                       wqe->last_psn = qp->req.psn;
+                       qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+                       qp->req.opcode = IB_OPCODE_UD_SEND_ONLY;
+                       qp->req.wqe_index = next_index(qp->sq.queue,
+                                                      qp->req.wqe_index);
+                       wqe->state = wqe_state_done;
+                       wqe->status = IB_WC_SUCCESS;
+                       goto complete;
+               }
+               payload = mtu;
+       }
+
+       skb = init_req_packet(qp, wqe, opcode, payload, &pkt);
+       if (unlikely(!skb)) {
+               pr_err("Failed allocating skb\n");
+               goto err;
+       }
+
+       if (fill_packet(qp, wqe, &pkt, skb, payload)) {
+               pr_debug("Error during fill packet\n");
+               goto err;
+       }
+
+       update_wqe_state(qp, wqe, &pkt, &prev_state);
+       ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
+       if (ret) {
+               qp->need_req_skb = 1;
+
+               /* undo update_wqe_state() */
+               wqe->state = prev_state;
+
+               if (ret == -EAGAIN) {
+                       /* free here: this path does not pass through
+                        * the err label
+                        */
+                       kfree_skb(skb);
+                       rxe_run_task(&qp->req.task, 1);
+                       goto exit;
+               }
+
+               /* the err label frees the skb; freeing it here as well
+                * would be a double free
+                */
+               goto err;
+       }
+
+       update_state(qp, wqe, &pkt, payload);
+
+       goto next_wqe;
+
+err:
+       /* skb is NULL here when init_req_packet() failed */
+       kfree_skb(skb);
+       wqe->status = IB_WC_LOC_PROT_ERR;
+       wqe->state = wqe_state_error;
+
+complete:
+       /* for anything but RC the completer is run to exhaustion here */
+       if (qp_type(qp) != IB_QPT_RC) {
+               while (rxe_completer(qp) == 0)
+                       ;
+       }
+
+       return 0;
+
+exit:
+       return -EAGAIN;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
new file mode 100644 (file)
index 0000000..ebb03b4
--- /dev/null
@@ -0,0 +1,1380 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/*
+ * States of the responder state machine. The step functions in this
+ * file each return the state to enter next. Keep this list in sync
+ * with resp_state_name[] below, which is indexed by these values.
+ */
+enum resp_states {
+       RESPST_NONE,
+       RESPST_GET_REQ,
+       RESPST_CHK_PSN,
+       RESPST_CHK_OP_SEQ,
+       RESPST_CHK_OP_VALID,
+       RESPST_CHK_RESOURCE,
+       RESPST_CHK_LENGTH,
+       RESPST_CHK_RKEY,
+       RESPST_EXECUTE,
+       RESPST_READ_REPLY,
+       RESPST_COMPLETE,
+       RESPST_ACKNOWLEDGE,
+       RESPST_CLEANUP,
+       RESPST_DUPLICATE_REQUEST,
+       RESPST_ERR_MALFORMED_WQE,
+       RESPST_ERR_UNSUPPORTED_OPCODE,
+       RESPST_ERR_MISALIGNED_ATOMIC,
+       RESPST_ERR_PSN_OUT_OF_SEQ,
+       RESPST_ERR_MISSING_OPCODE_FIRST,
+       RESPST_ERR_MISSING_OPCODE_LAST_C,
+       RESPST_ERR_MISSING_OPCODE_LAST_D1E,
+       RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
+       RESPST_ERR_RNR,
+       RESPST_ERR_RKEY_VIOLATION,
+       RESPST_ERR_LENGTH,
+       RESPST_ERR_CQ_OVERFLOW,
+       RESPST_ERROR,
+       RESPST_RESET,
+       RESPST_DONE,
+       RESPST_EXIT,
+};
+
+/*
+ * Printable name for each responder state, indexed by enum resp_states.
+ * Each string is the enumerator name without its RESPST_ prefix.
+ */
+static char *resp_state_name[] = {
+       [RESPST_NONE]                           = "NONE",
+       [RESPST_GET_REQ]                        = "GET_REQ",
+       [RESPST_CHK_PSN]                        = "CHK_PSN",
+       [RESPST_CHK_OP_SEQ]                     = "CHK_OP_SEQ",
+       [RESPST_CHK_OP_VALID]                   = "CHK_OP_VALID",
+       [RESPST_CHK_RESOURCE]                   = "CHK_RESOURCE",
+       [RESPST_CHK_LENGTH]                     = "CHK_LENGTH",
+       [RESPST_CHK_RKEY]                       = "CHK_RKEY",
+       [RESPST_EXECUTE]                        = "EXECUTE",
+       [RESPST_READ_REPLY]                     = "READ_REPLY",
+       [RESPST_COMPLETE]                       = "COMPLETE",
+       [RESPST_ACKNOWLEDGE]                    = "ACKNOWLEDGE",
+       [RESPST_CLEANUP]                        = "CLEANUP",
+       [RESPST_DUPLICATE_REQUEST]              = "DUPLICATE_REQUEST",
+       [RESPST_ERR_MALFORMED_WQE]              = "ERR_MALFORMED_WQE",
+       [RESPST_ERR_UNSUPPORTED_OPCODE]         = "ERR_UNSUPPORTED_OPCODE",
+       [RESPST_ERR_MISALIGNED_ATOMIC]          = "ERR_MISALIGNED_ATOMIC",
+       [RESPST_ERR_PSN_OUT_OF_SEQ]             = "ERR_PSN_OUT_OF_SEQ",
+       [RESPST_ERR_MISSING_OPCODE_FIRST]       = "ERR_MISSING_OPCODE_FIRST",
+       [RESPST_ERR_MISSING_OPCODE_LAST_C]      = "ERR_MISSING_OPCODE_LAST_C",
+       [RESPST_ERR_MISSING_OPCODE_LAST_D1E]    = "ERR_MISSING_OPCODE_LAST_D1E",
+       [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ]      = "ERR_TOO_MANY_RDMA_ATM_REQ",
+       [RESPST_ERR_RNR]                        = "ERR_RNR",
+       [RESPST_ERR_RKEY_VIOLATION]             = "ERR_RKEY_VIOLATION",
+       [RESPST_ERR_LENGTH]                     = "ERR_LENGTH",
+       [RESPST_ERR_CQ_OVERFLOW]                = "ERR_CQ_OVERFLOW",
+       [RESPST_ERROR]                          = "ERROR",
+       [RESPST_RESET]                          = "RESET",
+       [RESPST_DONE]                           = "DONE",
+       [RESPST_EXIT]                           = "EXIT",
+};
+
+/*
+ * Entry point used by rxe_recv: append a request packet to the QP's
+ * input queue and kick the responder task.
+ *
+ * The task is asked to be scheduled rather than run directly when the
+ * packet is an RC RDMA read request or when other packets are already
+ * waiting in the queue.
+ */
+void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
+                       struct sk_buff *skb)
+{
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       int must_sched = 0;
+
+       skb_queue_tail(&qp->req_pkts, skb);
+
+       if (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST)
+               must_sched = 1;
+       else if (skb_queue_len(&qp->req_pkts) > 1)
+               must_sched = 1;
+
+       rxe_run_task(&qp->resp.task, must_sched);
+}
+
+/*
+ * Fetch the next request packet for the responder.
+ *
+ * Normal operation: peek at the head of the request queue without
+ * removing it, store the packet info through @pkt_p, and continue
+ * with READ_REPLY when a read resource is active (qp->resp.res) or
+ * with CHK_PSN otherwise. An empty queue yields EXIT.
+ *
+ * Error state: packets are dequeued and freed one per call, dropping
+ * a QP reference each time; once the queue is empty the state machine
+ * moves on to CHK_RESOURCE to drain the receive queue.
+ */
+static inline enum resp_states get_req(struct rxe_qp *qp,
+                                      struct rxe_pkt_info **pkt_p)
+{
+       struct sk_buff *skb;
+
+       if (qp->resp.state != QP_STATE_ERROR) {
+               skb = skb_peek(&qp->req_pkts);
+               if (!skb)
+                       return RESPST_EXIT;
+
+               *pkt_p = SKB_TO_PKT(skb);
+
+               if (qp->resp.res)
+                       return RESPST_READ_REPLY;
+
+               return RESPST_CHK_PSN;
+       }
+
+       skb = skb_dequeue(&qp->req_pkts);
+       if (!skb) {
+               /* go drain recv wr queue */
+               return RESPST_CHK_RESOURCE;
+       }
+
+       /* drain request packet queue */
+       rxe_drop_ref(qp);
+       kfree_skb(skb);
+       return RESPST_GET_REQ;
+}
+
+/*
+ * Compare the packet's PSN with the PSN the responder expects.
+ *
+ * RC: a PSN ahead of the expected one produces ERR_PSN_OUT_OF_SEQ the
+ * first time and CLEANUP while sent_psn_nak is still set; a PSN behind
+ * it is a duplicate request; a matching PSN clears sent_psn_nak.
+ *
+ * UC: on a PSN mismatch, or while a message is being dropped, only a
+ * packet that starts a new message is let through (and clears
+ * drop_msg); anything else sets drop_msg and goes to CLEANUP.
+ *
+ * All other QP types proceed to CHK_OP_SEQ unconditionally.
+ */
+static enum resp_states check_psn(struct rxe_qp *qp,
+                                 struct rxe_pkt_info *pkt)
+{
+       int diff = psn_compare(pkt->psn, qp->resp.psn);
+
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               if (diff < 0)
+                       return RESPST_DUPLICATE_REQUEST;
+
+               if (diff > 0) {
+                       if (qp->resp.sent_psn_nak)
+                               return RESPST_CLEANUP;
+
+                       qp->resp.sent_psn_nak = 1;
+                       return RESPST_ERR_PSN_OUT_OF_SEQ;
+               }
+
+               if (qp->resp.sent_psn_nak)
+                       qp->resp.sent_psn_nak = 0;
+
+               break;
+
+       case IB_QPT_UC:
+               if (!qp->resp.drop_msg && diff == 0)
+                       break;
+
+               if (pkt->mask & RXE_START_MASK) {
+                       qp->resp.drop_msg = 0;
+                       return RESPST_CHK_OP_SEQ;
+               }
+
+               qp->resp.drop_msg = 1;
+               return RESPST_CLEANUP;
+
+       default:
+               break;
+       }
+
+       return RESPST_CHK_OP_SEQ;
+}
+
+/*
+ * Check that the packet's opcode is a legal successor of
+ * qp->resp.opcode (presumably the opcode of the previously accepted
+ * packet -- confirm where resp.opcode is updated).
+ *
+ * After a FIRST or MIDDLE packet only MIDDLE or LAST packets of the
+ * same kind (send or RDMA write) are accepted; otherwise a MIDDLE or
+ * LAST packet is rejected because no message is in progress. RC
+ * reports violations as error states; UC reports a missing LAST as an
+ * error state and silently drops a message with a missing FIRST.
+ * Other QP types are not checked.
+ */
+static enum resp_states check_op_seq(struct rxe_qp *qp,
+                                    struct rxe_pkt_info *pkt)
+{
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               switch (qp->resp.opcode) {
+               /* in the middle of a multi-packet send */
+               case IB_OPCODE_RC_SEND_FIRST:
+               case IB_OPCODE_RC_SEND_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_RC_SEND_MIDDLE:
+                       case IB_OPCODE_RC_SEND_LAST:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_C;
+                       }
+
+               /* in the middle of a multi-packet RDMA write */
+               case IB_OPCODE_RC_RDMA_WRITE_FIRST:
+               case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_C;
+                       }
+
+               /* no message in progress: MIDDLE/LAST have no FIRST */
+               default:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_RC_SEND_MIDDLE:
+                       case IB_OPCODE_RC_SEND_LAST:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+                       case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               return RESPST_ERR_MISSING_OPCODE_FIRST;
+                       default:
+                               return RESPST_CHK_OP_VALID;
+                       }
+               }
+               break;
+
+       case IB_QPT_UC:
+               switch (qp->resp.opcode) {
+               /* in the middle of a multi-packet send */
+               case IB_OPCODE_UC_SEND_FIRST:
+               case IB_OPCODE_UC_SEND_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_UC_SEND_MIDDLE:
+                       case IB_OPCODE_UC_SEND_LAST:
+                       case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+                       }
+
+               /* in the middle of a multi-packet RDMA write */
+               case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+               case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+                       }
+
+               /* no message in progress: drop instead of reporting */
+               default:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_UC_SEND_MIDDLE:
+                       case IB_OPCODE_UC_SEND_LAST:
+                       case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+                       case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               qp->resp.drop_msg = 1;
+                               return RESPST_CLEANUP;
+                       default:
+                               return RESPST_CHK_OP_VALID;
+                       }
+               }
+               break;
+
+       default:
+               return RESPST_CHK_OP_VALID;
+       }
+}
+
+/*
+ * Check that the QP's access flags permit the requested operation.
+ *
+ * RC: a read, write or atomic request without the matching
+ * IB_ACCESS_REMOTE_* flag yields ERR_UNSUPPORTED_OPCODE.
+ * UC: a write without IB_ACCESS_REMOTE_WRITE marks the message as
+ * dropped and goes to CLEANUP.
+ * UD, SMI and GSI are not checked; any other QP type triggers a
+ * warning. Every packet that passes continues with CHK_RESOURCE.
+ */
+static enum resp_states check_op_valid(struct rxe_qp *qp,
+                                      struct rxe_pkt_info *pkt)
+{
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               if (((pkt->mask & RXE_READ_MASK) &&
+                    !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
+                   ((pkt->mask & RXE_WRITE_MASK) &&
+                    !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
+                   ((pkt->mask & RXE_ATOMIC_MASK) &&
+                    !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) {
+                       return RESPST_ERR_UNSUPPORTED_OPCODE;
+               }
+
+               break;
+
+       case IB_QPT_UC:
+               if ((pkt->mask & RXE_WRITE_MASK) &&
+                   !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) {
+                       /* no error state is entered, the message is just dropped */
+                       qp->resp.drop_msg = 1;
+                       return RESPST_CLEANUP;
+               }
+
+               break;
+
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               break;
+
+       default:
+               /* unexpected QP type: warn, then let the packet continue */
+               WARN_ON(1);
+               break;
+       }
+
+       return RESPST_CHK_RESOURCE;
+}
+
+/*
+ * Take the next receive wqe from the QP's shared receive queue.
+ *
+ * The wqe is copied into qp->resp.srq_wqe under the SRQ consumer lock
+ * and the queue entry is consumed. If a limit is armed and the queue
+ * has dropped below it, the limit is cleared and an
+ * IB_EVENT_SRQ_LIMIT_REACHED event is delivered after the lock has
+ * been released.
+ *
+ * Returns CHK_LENGTH on success, or ERR_RNR when the SRQ is in error
+ * or empty.
+ */
+static enum resp_states get_srq_wqe(struct rxe_qp *qp)
+{
+       struct rxe_srq *srq = qp->srq;
+       struct rxe_queue *q = srq->rq.queue;
+       struct rxe_recv_wqe *wqe;
+       struct ib_event ev;
+
+       if (srq->error)
+               return RESPST_ERR_RNR;
+
+       spin_lock_bh(&srq->rq.consumer_lock);
+
+       wqe = queue_head(q);
+       if (!wqe) {
+               spin_unlock_bh(&srq->rq.consumer_lock);
+               return RESPST_ERR_RNR;
+       }
+
+       /* note kernel and user space recv wqes have same size */
+       memcpy(&qp->resp.srq_wqe, wqe, sizeof(qp->resp.srq_wqe));
+
+       /* from here on work with the private copy, not the queue slot */
+       qp->resp.wqe = &qp->resp.srq_wqe.wqe;
+       advance_consumer(q);
+
+       if (srq->limit && srq->ibsrq.event_handler &&
+           (queue_count(q) < srq->limit)) {
+               /* clear the limit so the event is raised only once */
+               srq->limit = 0;
+               goto event;
+       }
+
+       spin_unlock_bh(&srq->rq.consumer_lock);
+       return RESPST_CHK_LENGTH;
+
+event:
+       /* drop the lock before calling the handler */
+       spin_unlock_bh(&srq->rq.consumer_lock);
+       ev.device = qp->ibqp.device;
+       ev.element.srq = qp->ibqp.srq;
+       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+       srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context);
+       return RESPST_CHK_LENGTH;
+}
+
+/*
+ * Make sure the resources needed to execute the request exist.
+ *
+ * Error state: flush receive wqes one at a time with
+ * IB_WC_WR_FLUSH_ERR. A wqe already held is flushed first; without an
+ * SRQ the head of the receive queue is taken next; EXIT is returned
+ * when nothing is left or when the QP uses an SRQ.
+ *
+ * Read/atomic requests only need max_rd_atomic to be non-zero.
+ * Requests that consume a receive wqe get one from the SRQ or from
+ * the QP's own receive queue, or fail with ERR_RNR if it is empty.
+ */
+static enum resp_states check_resource(struct rxe_qp *qp,
+                                      struct rxe_pkt_info *pkt)
+{
+       struct rxe_srq *srq = qp->srq;
+
+       if (qp->resp.state == QP_STATE_ERROR) {
+               if (!qp->resp.wqe) {
+                       if (srq)
+                               return RESPST_EXIT;
+
+                       qp->resp.wqe = queue_head(qp->rq.queue);
+                       if (!qp->resp.wqe)
+                               return RESPST_EXIT;
+               }
+
+               qp->resp.status = IB_WC_WR_FLUSH_ERR;
+               return RESPST_COMPLETE;
+       }
+
+       if (pkt->mask & RXE_READ_OR_ATOMIC) {
+               /* it is the requesters job to not send
+                * too many read/atomic ops, we just
+                * recycle the responder resource queue
+                */
+               return likely(qp->attr.max_rd_atomic > 0) ?
+                       RESPST_CHK_LENGTH :
+                       RESPST_ERR_TOO_MANY_RDMA_ATM_REQ;
+       }
+
+       if (!(pkt->mask & RXE_RWR_MASK))
+               return RESPST_CHK_LENGTH;
+
+       if (srq)
+               return get_srq_wqe(qp);
+
+       qp->resp.wqe = queue_head(qp->rq.queue);
+       if (!qp->resp.wqe)
+               return RESPST_ERR_RNR;
+
+       return RESPST_CHK_LENGTH;
+}
+
+/*
+ * Length-check step of the responder state machine.
+ *
+ * No length validation is performed at this step for any QP type;
+ * RC, UC and everything else all continue with the rkey check.
+ */
+static enum resp_states check_length(struct rxe_qp *qp,
+                                    struct rxe_pkt_info *pkt)
+{
+       return RESPST_CHK_RKEY;
+}
+
+/*
+ * Validate the remote key, address range and packet length of an RDMA
+ * read, RDMA write or atomic request.
+ *
+ * The target va/rkey/length are taken from the RETH (when present) or
+ * the ATMETH and cached in qp->resp. Packets that are neither
+ * read/write nor atomic skip the check. On success the memory region
+ * is stored in qp->resp.mr, still holding the reference obtained from
+ * lookup_mem(), and EXECUTE is returned.
+ *
+ * Returns ERR_RKEY_VIOLATION when the lookup fails, the region is in
+ * the FREE state or the range check fails, and ERR_LENGTH when the
+ * payload size or pad count of a write does not match. Every failure
+ * after a successful lookup drops the memory region reference.
+ */
+static enum resp_states check_rkey(struct rxe_qp *qp,
+                                  struct rxe_pkt_info *pkt)
+{
+       struct rxe_mem *mem;
+       u64 va;
+       u32 rkey;
+       u32 resid;
+       u32 pktlen;
+       int mtu = qp->mtu;
+       enum resp_states state;
+       int access;
+
+       if (pkt->mask & (RXE_READ_MASK | RXE_WRITE_MASK)) {
+               /* only packets carrying a RETH refresh the cached
+                * target; later packets of a write reuse it
+                */
+               if (pkt->mask & RXE_RETH_MASK) {
+                       qp->resp.va = reth_va(pkt);
+                       qp->resp.rkey = reth_rkey(pkt);
+                       qp->resp.resid = reth_len(pkt);
+               }
+               access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
+                                                    : IB_ACCESS_REMOTE_WRITE;
+       } else if (pkt->mask & RXE_ATOMIC_MASK) {
+               qp->resp.va = atmeth_va(pkt);
+               qp->resp.rkey = atmeth_rkey(pkt);
+               qp->resp.resid = sizeof(u64);
+               access = IB_ACCESS_REMOTE_ATOMIC;
+       } else {
+               return RESPST_EXECUTE;
+       }
+
+       va      = qp->resp.va;
+       rkey    = qp->resp.rkey;
+       resid   = qp->resp.resid;
+       pktlen  = payload_size(pkt);
+
+       mem = lookup_mem(qp->pd, access, rkey, lookup_remote);
+       if (!mem) {
+               state = RESPST_ERR_RKEY_VIOLATION;
+               goto err1;
+       }
+
+       if (unlikely(mem->state == RXE_MEM_STATE_FREE)) {
+               state = RESPST_ERR_RKEY_VIOLATION;
+               /* mem is referenced at this point, so it has to be
+                * released like on the other post-lookup failures
+                */
+               goto err2;
+       }
+
+       if (mem_check_range(mem, va, resid)) {
+               state = RESPST_ERR_RKEY_VIOLATION;
+               goto err2;
+       }
+
+       if (pkt->mask & RXE_WRITE_MASK)  {
+               if (resid > mtu) {
+                       /* not the last packet: must be exactly one
+                        * unpadded MTU of data
+                        */
+                       if (pktlen != mtu || bth_pad(pkt)) {
+                               state = RESPST_ERR_LENGTH;
+                               goto err2;
+                       }
+
+                       resid = mtu;
+               } else {
+                       /* last packet: must carry exactly what is left */
+                       if (pktlen != resid) {
+                               state = RESPST_ERR_LENGTH;
+                               goto err2;
+                       }
+                       if ((bth_pad(pkt) != (0x3 & (-resid)))) {
+                               /* This case may not be exactly that
+                                * but nothing else fits.
+                                */
+                               state = RESPST_ERR_LENGTH;
+                               goto err2;
+                       }
+               }
+       }
+
+       WARN_ON(qp->resp.mr);
+
+       qp->resp.mr = mem;
+       return RESPST_EXECUTE;
+
+err2:
+       rxe_drop_ref(mem);
+err1:
+       return state;
+}
+
+/* Copy data_len bytes from data_addr into the scatter list of the current
+ * receive WQE.
+ *
+ * Returns RESPST_NONE on success, RESPST_ERR_LENGTH when copy_data() fails
+ * with -ENOSPC and RESPST_ERR_MALFORMED_WQE for any other failure.
+ */
+static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr,
+                                    int data_len)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       int rc;
+
+       rc = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma,
+                      data_addr, data_len, to_mem_obj, NULL);
+       if (likely(!rc))
+               return RESPST_NONE;
+
+       if (rc == -ENOSPC)
+               return RESPST_ERR_LENGTH;
+
+       return RESPST_ERR_MALFORMED_WQE;
+}
+
+/* Copy the payload of an RDMA write packet into the target memory region
+ * and move the responder's write cursor (va / resid) past it.
+ *
+ * Returns RESPST_NONE on success, or RESPST_ERR_RKEY_VIOLATION if the copy
+ * fails, in which case the cursor is left untouched.
+ */
+static enum resp_states write_data_in(struct rxe_qp *qp,
+                                     struct rxe_pkt_info *pkt)
+{
+       int len = payload_size(pkt);
+
+       if (rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt),
+                        len, to_mem_obj, NULL))
+               return RESPST_ERR_RKEY_VIOLATION;
+
+       qp->resp.va += len;
+       qp->resp.resid -= len;
+
+       return RESPST_NONE;
+}
+
+/* Guarantee atomicity of atomic operations at the machine level. */
+static DEFINE_SPINLOCK(atomic_ops_lock);
+
+/* Execute an atomic request against the region held in qp->resp.mr.
+ *
+ * COMPARE_SWAP opcodes store the swap value only if the current value equals
+ * the compare value; any other opcode adds the swap/add operand.  The value
+ * found before the operation is saved in qp->resp.atomic_orig.  The whole
+ * read-modify-write runs under atomic_ops_lock.
+ *
+ * Returns RESPST_NONE on success, RESPST_ERR_RKEY_VIOLATION if the region is
+ * not in the valid state, or RESPST_ERR_MISALIGNED_ATOMIC if the target
+ * cannot be mapped or is not 8-byte aligned.
+ */
+static enum resp_states process_atomic(struct rxe_qp *qp,
+                                      struct rxe_pkt_info *pkt)
+{
+       struct rxe_mem *mr = qp->resp.mr;
+       u64 *target;
+       bool is_cmp_swap;
+
+       if (mr->state != RXE_MEM_STATE_VALID)
+               return RESPST_ERR_RKEY_VIOLATION;
+
+       target = iova_to_vaddr(mr, atmeth_va(pkt), sizeof(u64));
+
+       /* the target must be mappable and 8-byte aligned */
+       if (!target || ((uintptr_t)target & 7))
+               return RESPST_ERR_MISALIGNED_ATOMIC;
+
+       is_cmp_swap = pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP ||
+                     pkt->opcode == IB_OPCODE_RD_COMPARE_SWAP;
+
+       spin_lock_bh(&atomic_ops_lock);
+
+       /* remember the value found before the operation */
+       qp->resp.atomic_orig = *target;
+
+       if (!is_cmp_swap)
+               *target += atmeth_swap_add(pkt);
+       else if (*target == atmeth_comp(pkt))
+               *target = atmeth_swap_add(pkt);
+
+       spin_unlock_bh(&atomic_ops_lock);
+
+       return RESPST_NONE;
+}
+
+/* Build a response packet of type @opcode answering the request @pkt.
+ *
+ * An skb large enough for the opcode's headers, @payload bytes, padding to
+ * a 4-byte multiple and the ICRC is obtained from ifc_ops->init_packet().
+ * The request's headers up to and including the BTH are copied into the
+ * response and the BTH fields are then overwritten; AETH and AtomicAckETH
+ * are filled in when the opcode carries them.
+ *
+ * If @crcp is not NULL the CRC computed so far is handed back through it so
+ * that the caller can continue it over the payload and store the ICRC
+ * itself.  Otherwise the inverted CRC is written right behind the (padded)
+ * payload here.
+ *
+ * Returns the new skb, or NULL if allocation or ifc_ops->prepare() failed.
+ */
+static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
+                                         struct rxe_pkt_info *pkt,
+                                         struct rxe_pkt_info *ack,
+                                         int opcode,
+                                         int payload,
+                                         u32 psn,
+                                         u8 syndrome,
+                                         u32 *crcp)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       int pad = (-payload) & 0x3;
+       int paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+       struct sk_buff *skb;
+       u32 icrc = 0;
+
+       skb = rxe->ifc_ops->init_packet(rxe, &qp->pri_av, paylen, ack);
+       if (!skb)
+               return NULL;
+
+       ack->qp = qp;
+       ack->opcode = opcode;
+       ack->mask = rxe_opcode[opcode].mask;
+       ack->offset = pkt->offset;
+       ack->paylen = paylen;
+
+       /* start from the request's headers, then patch the BTH */
+       memcpy(ack->hdr, pkt->hdr, pkt->offset + RXE_BTH_BYTES);
+
+       bth_set_opcode(ack, opcode);
+       bth_set_qpn(ack, qp->attr.dest_qp_num);
+       bth_set_pad(ack, pad);
+       bth_set_se(ack, 0);
+       bth_set_psn(ack, psn);
+       bth_set_ack(ack, 0);
+       ack->psn = psn;
+
+       if (ack->mask & RXE_AETH_MASK) {
+               aeth_set_syn(ack, syndrome);
+               aeth_set_msn(ack, qp->resp.msn);
+       }
+
+       if (ack->mask & RXE_ATMACK_MASK)
+               atmack_set_orig(ack, qp->resp.atomic_orig);
+
+       if (rxe->ifc_ops->prepare(rxe, ack, skb, &icrc)) {
+               kfree_skb(skb);
+               return NULL;
+       }
+
+       if (!crcp) {
+               u32 *icrc_slot = payload_addr(ack) + payload + bth_pad(ack);
+
+               *icrc_slot = ~icrc;
+       } else {
+               /* CRC computation will be continued by the caller */
+               *crcp = icrc;
+       }
+
+       return skb;
+}
+
+/* RDMA read response.
+ *
+ * Sends one read response packet per invocation.  If qp->resp.res is not
+ * NULL a read request is already being processed or replayed and the next
+ * packet of that resource is sent.  Otherwise a new responder resource is
+ * set up from the request and takes over the memory region reference from
+ * qp->resp.mr.
+ *
+ * Returns RESPST_DONE while more response packets remain, RESPST_CLEANUP
+ * after the last one, and RESPST_ERR_RNR if the packet could not be built
+ * or transmitted.
+ */
+static enum resp_states read_reply(struct rxe_qp *qp,
+                                  struct rxe_pkt_info *req_pkt)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct rxe_pkt_info ack_pkt;
+       struct sk_buff *skb;
+       int mtu = qp->mtu;
+       enum resp_states state;
+       int payload;
+       int opcode;
+       int err;
+       struct resp_res *res = qp->resp.res;
+       u32 icrc;
+       u32 *p;
+
+       if (!res) {
+               /* This is the first time we process that request. Get a
+                * resource
+                */
+               res = &qp->resp.resources[qp->resp.res_head];
+
+               free_rd_atomic_resource(qp, res);
+               rxe_advance_resp_resource(qp);
+
+               res->type               = RXE_READ_MASK;
+
+               res->read.va            = qp->resp.va;
+               res->read.va_org        = qp->resp.va;
+
+               res->first_psn          = req_pkt->psn;
+
+               /* PSNs wrap at BTH_PSN_MASK (see the cur_psn update below),
+                * so last_psn has to wrap the same way or a duplicate of a
+                * request that straddles the wrap would never be matched.  A
+                * zero length read is still answered with a single packet,
+                * so its range is just first_psn.
+                */
+               if (reth_len(req_pkt))
+                       res->last_psn   = (req_pkt->psn +
+                                          (reth_len(req_pkt) + mtu - 1) /
+                                          mtu - 1) & BTH_PSN_MASK;
+               else
+                       res->last_psn   = res->first_psn;
+
+               res->cur_psn            = req_pkt->psn;
+
+               res->read.resid         = qp->resp.resid;
+               res->read.length        = qp->resp.resid;
+               res->read.rkey          = qp->resp.rkey;
+
+               /* note res inherits the reference to mr from qp */
+               res->read.mr            = qp->resp.mr;
+               qp->resp.mr             = NULL;
+
+               qp->resp.res            = res;
+               res->state              = rdatm_res_state_new;
+       }
+
+       /* pick ONLY/FIRST for the first packet, MIDDLE/LAST afterwards */
+       if (res->state == rdatm_res_state_new) {
+               if (res->read.resid <= mtu)
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
+               else
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
+       } else {
+               if (res->read.resid > mtu)
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
+               else
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
+       }
+
+       res->state = rdatm_res_state_next;
+
+       payload = min_t(int, res->read.resid, mtu);
+
+       skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload,
+                                res->cur_psn, AETH_ACK_UNLIMITED, &icrc);
+       if (!skb)
+               return RESPST_ERR_RNR;
+
+       /* copy the data out of the region, continuing the CRC over it */
+       err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt),
+                          payload, from_mem_obj, &icrc);
+       if (err)
+               pr_err("Failed copying memory\n");
+
+       /* the ICRC sits right behind the padded payload */
+       p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt);
+       *p = ~icrc;
+
+       err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+       if (err) {
+               pr_err("Failed sending RDMA reply.\n");
+               kfree_skb(skb);
+               return RESPST_ERR_RNR;
+       }
+
+       res->read.va += payload;
+       res->read.resid -= payload;
+       res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK;
+
+       if (res->read.resid > 0) {
+               state = RESPST_DONE;
+       } else {
+               /* last packet sent: the read is complete */
+               qp->resp.res = NULL;
+               qp->resp.opcode = -1;
+               qp->resp.psn = res->cur_psn;
+               state = RESPST_CLEANUP;
+       }
+
+       return state;
+}
+
+/* Executes a new request. A retried request never reaches this function
+ * (sends and writes are discarded, and reads and atomics are retried
+ * elsewhere).
+ *
+ * Sends are copied into the current receive WQE (preceded by a network
+ * header for UD/SMI/GSI QPs), writes are copied into the target region,
+ * atomics are executed in place and reads are handed to the read reply
+ * state.  Returns the next responder state.
+ */
+static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
+{
+       enum resp_states st;
+
+       if (pkt->mask & RXE_SEND_MASK) {
+               switch (qp_type(qp)) {
+               case IB_QPT_UD:
+               case IB_QPT_SMI:
+               case IB_QPT_GSI: {
+                       /* datagram receives start with the network header */
+                       struct sk_buff *skb = PKT_TO_SKB(pkt);
+                       union rdma_network_hdr hdr;
+
+                       memset(&hdr, 0, sizeof(hdr));
+                       if (skb->protocol == htons(ETH_P_IP))
+                               memcpy(&hdr.roce4grh, ip_hdr(skb), sizeof(hdr.roce4grh));
+                       else if (skb->protocol == htons(ETH_P_IPV6))
+                               memcpy(&hdr.ibgrh, ipv6_hdr(skb), sizeof(hdr.ibgrh));
+
+                       st = send_data_in(qp, &hdr, sizeof(hdr));
+                       if (st)
+                               return st;
+                       break;
+               }
+               default:
+                       break;
+               }
+
+               st = send_data_in(qp, payload_addr(pkt), payload_size(pkt));
+               if (st)
+                       return st;
+       } else if (pkt->mask & RXE_WRITE_MASK) {
+               st = write_data_in(qp, pkt);
+               if (st)
+                       return st;
+       } else if (pkt->mask & RXE_READ_MASK) {
+               /* For RDMA Read we can increment the msn now. See C9-148. */
+               qp->resp.msn++;
+               return RESPST_READ_REPLY;
+       } else if (pkt->mask & RXE_ATOMIC_MASK) {
+               st = process_atomic(qp, pkt);
+               if (st)
+                       return st;
+       } else {
+               /* Unreachable */
+               WARN_ON(1);
+       }
+
+       /* We successfully processed this new request. */
+       qp->resp.msn++;
+
+       /* next expected psn, read handles this separately */
+       qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+       qp->resp.opcode = pkt->opcode;
+       qp->resp.status = IB_WC_SUCCESS;
+
+       if (pkt->mask & RXE_COMP_MASK)
+               return RESPST_COMPLETE;
+
+       if (qp_type(qp) == IB_QPT_RC)
+               return RESPST_ACKNOWLEDGE;
+
+       return RESPST_CLEANUP;
+}
+
+/* Generate the receive completion for the current receive WQE.
+ *
+ * Builds a CQE, in kernel or user layout depending on qp->rcq->is_user,
+ * from the WQE and, for successful completions, from the request packet.
+ * The WQE is then released and the CQE is posted to the receive CQ.  For
+ * kernel consumers a send-with-invalidate also moves the named memory
+ * region to the free state.
+ *
+ * NOTE(review): pkt may be NULL here (see the "pkt ? ... : 1" and "!pkt"
+ * tests below) while the success path dereferences it; this presumably
+ * relies on qp->resp.status never being IB_WC_SUCCESS without a packet -
+ * confirm against the callers.
+ *
+ * Returns the next responder state.
+ */
+static enum resp_states do_complete(struct rxe_qp *qp,
+                                   struct rxe_pkt_info *pkt)
+{
+       struct rxe_cqe cqe;
+       struct ib_wc *wc = &cqe.ibwc;
+       struct ib_uverbs_wc *uwc = &cqe.uibwc;
+       struct rxe_recv_wqe *wqe = qp->resp.wqe;
+
+       /* nothing to complete without a receive WQE */
+       if (unlikely(!wqe))
+               return RESPST_CLEANUP;
+
+       memset(&cqe, 0, sizeof(cqe));
+
+       wc->wr_id               = wqe->wr_id;
+       wc->status              = qp->resp.status;
+       wc->qp                  = &qp->ibqp;
+
+       /* fields after status are not required for errors */
+       if (wc->status == IB_WC_SUCCESS) {
+               wc->opcode = (pkt->mask & RXE_IMMDT_MASK &&
+                               pkt->mask & RXE_WRITE_MASK) ?
+                                       IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
+               wc->vendor_err = 0;
+               wc->byte_len = wqe->dma.length - wqe->dma.resid;
+
+               /* fields after byte_len are different between kernel and user
+                * space
+                */
+               if (qp->rcq->is_user) {
+                       uwc->wc_flags = IB_WC_GRH;
+
+                       if (pkt->mask & RXE_IMMDT_MASK) {
+                               uwc->wc_flags |= IB_WC_WITH_IMM;
+                               uwc->ex.imm_data =
+                                       (__u32 __force)immdt_imm(pkt);
+                       }
+
+                       if (pkt->mask & RXE_IETH_MASK) {
+                               uwc->wc_flags |= IB_WC_WITH_INVALIDATE;
+                               uwc->ex.invalidate_rkey = ieth_rkey(pkt);
+                       }
+
+                       uwc->qp_num             = qp->ibqp.qp_num;
+
+                       if (pkt->mask & RXE_DETH_MASK)
+                               uwc->src_qp = deth_sqp(pkt);
+
+                       uwc->port_num           = qp->attr.port_num;
+               } else {
+                       struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+                       wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE;
+                       if (skb->protocol == htons(ETH_P_IP))
+                               wc->network_hdr_type = RDMA_NETWORK_IPV4;
+                       else
+                               wc->network_hdr_type = RDMA_NETWORK_IPV6;
+
+                       if (pkt->mask & RXE_IMMDT_MASK) {
+                               wc->wc_flags |= IB_WC_WITH_IMM;
+                               wc->ex.imm_data = immdt_imm(pkt);
+                       }
+
+                       if (pkt->mask & RXE_IETH_MASK) {
+                               struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+                               struct rxe_mem *rmr;
+
+                               wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+                               wc->ex.invalidate_rkey = ieth_rkey(pkt);
+
+                               rmr = rxe_pool_get_index(&rxe->mr_pool,
+                                                        wc->ex.invalidate_rkey >> 8);
+                               if (unlikely(!rmr)) {
+                                       pr_err("Bad rkey %#x invalidation\n", wc->ex.invalidate_rkey);
+                                       return RESPST_ERROR;
+                               }
+                               rmr->state = RXE_MEM_STATE_FREE;
+                               /* the pool lookup took a reference that is
+                                * only needed for the state change above
+                                */
+                               rxe_drop_ref(rmr);
+                       }
+
+                       wc->qp                  = &qp->ibqp;
+
+                       if (pkt->mask & RXE_DETH_MASK)
+                               wc->src_qp = deth_sqp(pkt);
+
+                       wc->port_num            = qp->attr.port_num;
+               }
+       }
+
+       /* have copy for srq and reference for !srq */
+       if (!qp->srq)
+               advance_consumer(qp->rq.queue);
+
+       qp->resp.wqe = NULL;
+
+       /* without a packet the completion is always posted as solicited */
+       if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1))
+               return RESPST_ERR_CQ_OVERFLOW;
+
+       if (qp->resp.state == QP_STATE_ERROR)
+               return RESPST_CHK_RESOURCE;
+
+       if (!pkt)
+               return RESPST_DONE;
+       else if (qp_type(qp) == IB_QPT_RC)
+               return RESPST_ACKNOWLEDGE;
+       else
+               return RESPST_CLEANUP;
+}
+
+/* Build and transmit an RC ACKNOWLEDGE packet carrying @syndrome and @psn
+ * in reply to @pkt.
+ *
+ * Returns 0 on success, -ENOMEM if the packet could not be built, or the
+ * error returned by rxe_xmit_packet(), in which case the skb is freed here.
+ */
+static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+                   u8 syndrome, u32 psn)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct rxe_pkt_info ack_pkt;
+       struct sk_buff *skb;
+       int rc;
+
+       skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
+                                0, psn, syndrome, NULL);
+       if (!skb)
+               return -ENOMEM;
+
+       rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+       if (!rc)
+               return 0;
+
+       pr_err_ratelimited("Failed sending ack\n");
+       kfree_skb(skb);
+
+       return rc;
+}
+
+/* Build and transmit the ATOMIC ACKNOWLEDGE for the atomic request @pkt.
+ *
+ * The original skb is kept in a responder resource so that the reply can be
+ * resent if the request is seen again; a clone of it is what actually gets
+ * transmitted.
+ *
+ * Returns 0 on success, -ENOMEM if the packet could not be built or cloned,
+ * or the error returned by rxe_xmit_packet().
+ */
+static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+                          u8 syndrome)
+{
+       int rc = 0;
+       struct rxe_pkt_info ack_pkt;
+       struct sk_buff *skb;
+       struct sk_buff *skb_copy;
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct resp_res *res;
+
+       skb = prepare_ack_packet(qp, pkt, &ack_pkt,
+                                IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
+                                syndrome, NULL);
+       if (!skb) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       skb_copy = skb_clone(skb, GFP_ATOMIC);
+       if (!skb_copy) {
+               pr_warn("Could not clone atomic response\n");
+               /* skb has not been stored or sent yet, so nobody else will
+                * free it
+                */
+               kfree_skb(skb);
+               rc = -ENOMEM;
+               goto out;
+       }
+       rxe_add_ref(qp); /* for the new SKB */
+
+       /* park the original in a responder resource for later replays */
+       res = &qp->resp.resources[qp->resp.res_head];
+       free_rd_atomic_resource(qp, res);
+       rxe_advance_resp_resource(qp);
+
+       res->type = RXE_ATOMIC_MASK;
+       res->atomic.skb = skb;
+       res->first_psn = qp->resp.psn;
+       res->last_psn = qp->resp.psn;
+       res->cur_psn = qp->resp.psn;
+
+       rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb_copy);
+       if (rc) {
+               pr_err_ratelimited("Failed sending ack\n");
+               rxe_drop_ref(qp);
+               kfree_skb(skb_copy);
+       }
+
+out:
+       return rc;
+}
+
+/* Send whatever acknowledgment the request @pkt calls for.
+ *
+ * Only RC QPs acknowledge.  A pending NAK syndrome takes precedence, then
+ * atomics get their atomic ack, and everything else is acked only when the
+ * requester asked for it in the BTH.  Send failures are ignored here.
+ *
+ * Always returns RESPST_CLEANUP.
+ */
+static enum resp_states acknowledge(struct rxe_qp *qp,
+                                   struct rxe_pkt_info *pkt)
+{
+       if (qp_type(qp) == IB_QPT_RC) {
+               if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) {
+                       send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
+               } else if (pkt->mask & RXE_ATOMIC_MASK) {
+                       send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED);
+               } else if (bth_ack(pkt)) {
+                       send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
+               }
+       }
+
+       return RESPST_CLEANUP;
+}
+
+/* Finish processing of the current request.
+ *
+ * If there was a request packet, the skb at the head of qp->req_pkts is
+ * dequeued and freed and one qp reference is dropped.  A memory region
+ * still attached to the responder is released as well.
+ *
+ * Always returns RESPST_DONE.
+ */
+static enum resp_states cleanup(struct rxe_qp *qp,
+                               struct rxe_pkt_info *pkt)
+{
+       struct rxe_mem *mr;
+
+       if (pkt) {
+               struct sk_buff *skb = skb_dequeue(&qp->req_pkts);
+
+               rxe_drop_ref(qp);
+               kfree_skb(skb);
+       }
+
+       mr = qp->resp.mr;
+       if (!mr)
+               return RESPST_DONE;
+
+       rxe_drop_ref(mr);
+       qp->resp.mr = NULL;
+
+       return RESPST_DONE;
+}
+
+/* Look for the responder resource whose PSN range contains @psn.
+ *
+ * Slots with type 0 are skipped.  Returns the first matching resource, or
+ * NULL if none of the qp->attr.max_rd_atomic slots matches.
+ */
+static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn)
+{
+       int slot;
+
+       for (slot = 0; slot < qp->attr.max_rd_atomic; slot++) {
+               struct resp_res *cand = &qp->resp.resources[slot];
+
+               if (cand->type != 0 &&
+                   psn_compare(psn, cand->first_psn) >= 0 &&
+                   psn_compare(psn, cand->last_psn) <= 0)
+                       return cand;
+       }
+
+       return NULL;
+}
+
+/* Handle a request whose PSN has already been processed.
+ *
+ * - SEND / RDMA WRITE: the data is discarded and, if the requester asked
+ *   for an ack, the most recent PSN is acknowledged again.
+ * - RDMA READ: if the request falls within a saved read resource and is the
+ *   same as, or a subset of, the original request, that resource is rewound
+ *   and replayed through RESPST_READ_REPLY.  Otherwise it is dropped.
+ * - Anything else (atomics): if a saved resource matches, a clone of its
+ *   stored reply is resent.  Otherwise the request is dropped.
+ *
+ * NOTE(review): find_resource() does not look at the resource type, so a
+ * read duplicate could match an atomic resource and vice versa - confirm
+ * whether the PSN ranges make that impossible.
+ *
+ * Returns RESPST_READ_REPLY for a replayed read and RESPST_CLEANUP in every
+ * other case.
+ */
+static enum resp_states duplicate_request(struct rxe_qp *qp,
+                                         struct rxe_pkt_info *pkt)
+{
+       struct resp_res *res;
+
+       if (pkt->mask & RXE_SEND_MASK ||
+           pkt->mask & RXE_WRITE_MASK) {
+               /* SEND. Ack again and cleanup. C9-105. */
+               if (bth_ack(pkt))
+                       send_ack(qp, pkt, AETH_ACK_UNLIMITED, qp->resp.psn - 1);
+               return RESPST_CLEANUP;
+       }
+
+       /* Find the operation in our list of responder resources. */
+       res = find_resource(qp, pkt->psn);
+
+       if (pkt->mask & RXE_READ_MASK) {
+               u64 iova;
+               u32 resid;
+
+               /* Resource not found. Class D error.  Drop the request. */
+               if (!res)
+                       return RESPST_CLEANUP;
+
+               /* Ensure this new request is the same as the previous
+                * one or a subset of it.
+                */
+               iova = reth_va(pkt);
+               resid = reth_len(pkt);
+
+               if (iova < res->read.va_org ||
+                   resid > res->read.length ||
+                   (iova + resid) > (res->read.va_org +
+                                     res->read.length))
+                       return RESPST_CLEANUP;
+
+               if (reth_rkey(pkt) != res->read.rkey)
+                       return RESPST_CLEANUP;
+
+               res->cur_psn = pkt->psn;
+               res->state = (pkt->psn == res->first_psn) ?
+                               rdatm_res_state_new :
+                               rdatm_res_state_replay;
+
+               /* Reset the resource, except length. */
+               res->read.va_org = iova;
+               res->read.va = iova;
+               res->read.resid = resid;
+
+               /* Replay the RDMA read reply. */
+               qp->resp.res = res;
+               return RESPST_READ_REPLY;
+       }
+
+       if (res) {
+               struct sk_buff *skb_copy;
+               int err;
+
+               skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC);
+               if (!skb_copy) {
+                       pr_warn("Couldn't clone atomic resp\n");
+                       return RESPST_CLEANUP;
+               }
+               rxe_add_ref(qp); /* for the new SKB */
+
+               bth_set_psn(SKB_TO_PKT(skb_copy),
+                           qp->resp.psn - 1);
+
+               /* Resend the result.
+                * NOTE(review): this passes the request's pkt rather than
+                * SKB_TO_PKT(skb_copy) - confirm that is intended.
+                */
+               err = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp,
+                                     pkt, skb_copy);
+               if (err) {
+                       pr_err("Failed resending result. This flow is not handled - skb ignored\n");
+                       /* give back the reference taken for the clone, as
+                        * send_atomic_ack() does on the same failure
+                        */
+                       rxe_drop_ref(qp);
+                       kfree_skb(skb_copy);
+               }
+       }
+
+       /* Either the reply was resent (or the attempt failed), or no
+        * resource was found (class D error): the request is dropped.
+        */
+       return RESPST_CLEANUP;
+}
+
+/* Record a class A or class C error on the responder; this implementation
+ * handles both classes identically.
+ *
+ * @syndrome is stored as the AETH syndrome and @status as the work
+ * completion status.
+ */
+static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome,
+                             enum ib_wc_status status)
+{
+       /* indicate that we should go through the ERROR state */
+       qp->resp.goto_error     = 1;
+
+       qp->resp.status         = status;
+       qp->resp.aeth_syndrome  = syndrome;
+}
+
+/* Handle a class D1 (no SRQ) or class E (SRQ) error on a UC QP.
+ *
+ * With an SRQ the message is marked dropped and, if a receive WQE has been
+ * taken, it is completed with IB_WC_REM_INV_REQ_ERR.  Without an SRQ the
+ * receive WQE is rewound so it can be reused and any attached memory
+ * region is released.
+ *
+ * Returns RESPST_COMPLETE when a WQE has to be completed in error,
+ * RESPST_CLEANUP otherwise.
+ */
+static enum resp_states do_class_d1e_error(struct rxe_qp *qp)
+{
+       struct rxe_recv_wqe *wqe;
+
+       /* UC */
+       if (!qp->srq) {
+               /* Class D1. This packet may be the start of a
+                * new message and could be valid. The previous
+                * message is invalid and ignored. reset the
+                * recv wr to its original state
+                */
+               wqe = qp->resp.wqe;
+               if (wqe) {
+                       wqe->dma.resid = wqe->dma.length;
+                       wqe->dma.cur_sge = 0;
+                       wqe->dma.sge_offset = 0;
+                       qp->resp.opcode = -1;
+               }
+
+               if (qp->resp.mr) {
+                       rxe_drop_ref(qp->resp.mr);
+                       qp->resp.mr = NULL;
+               }
+
+               return RESPST_CLEANUP;
+       }
+
+       /* Class E */
+       qp->resp.drop_msg = 1;
+       if (!qp->resp.wqe)
+               return RESPST_CLEANUP;
+
+       qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+       return RESPST_COMPLETE;
+}
+
+/* Responder state machine entry point.
+ *
+ * @arg is the struct rxe_qp to run the responder for.
+ *
+ * Starting from RESPST_RESET (QP in reset) or RESPST_GET_REQ (any other QP
+ * state), the loop below hands the current request from one stage to the
+ * next until a stage leaves through the "done" or "exit" label.  Each
+ * RESPST_ERR_* state maps an error onto the action for the current QP type:
+ * NAK, error completion, or silent drop.
+ *
+ * Returns 0 when the loop leaves through RESPST_DONE, -EINVAL if the QP is
+ * not valid, and -EAGAIN when it leaves through the exit label
+ * (RESPST_EXIT, RESPST_RESET, RESPST_ERROR).
+ * NOTE(review): how the caller interprets these values is not visible
+ * here - presumably the rxe task runner; confirm.
+ */
+int rxe_responder(void *arg)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)arg;
+       enum resp_states state;
+       struct rxe_pkt_info *pkt = NULL;
+       int ret = 0;
+
+       /* start every run without a pending NAK syndrome */
+       qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED;
+
+       if (!qp->valid) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       /* pick the initial state from the responder's QP state */
+       switch (qp->resp.state) {
+       case QP_STATE_RESET:
+               state = RESPST_RESET;
+               break;
+
+       default:
+               state = RESPST_GET_REQ;
+               break;
+       }
+
+       while (1) {
+               pr_debug("state = %s\n", resp_state_name[state]);
+               switch (state) {
+               case RESPST_GET_REQ:
+                       state = get_req(qp, &pkt);
+                       break;
+               case RESPST_CHK_PSN:
+                       state = check_psn(qp, pkt);
+                       break;
+               case RESPST_CHK_OP_SEQ:
+                       state = check_op_seq(qp, pkt);
+                       break;
+               case RESPST_CHK_OP_VALID:
+                       state = check_op_valid(qp, pkt);
+                       break;
+               case RESPST_CHK_RESOURCE:
+                       state = check_resource(qp, pkt);
+                       break;
+               case RESPST_CHK_LENGTH:
+                       state = check_length(qp, pkt);
+                       break;
+               case RESPST_CHK_RKEY:
+                       state = check_rkey(qp, pkt);
+                       break;
+               case RESPST_EXECUTE:
+                       state = execute(qp, pkt);
+                       break;
+               case RESPST_COMPLETE:
+                       state = do_complete(qp, pkt);
+                       break;
+               case RESPST_READ_REPLY:
+                       state = read_reply(qp, pkt);
+                       break;
+               case RESPST_ACKNOWLEDGE:
+                       state = acknowledge(qp, pkt);
+                       break;
+               case RESPST_CLEANUP:
+                       state = cleanup(qp, pkt);
+                       break;
+               case RESPST_DUPLICATE_REQUEST:
+                       state = duplicate_request(qp, pkt);
+                       break;
+               case RESPST_ERR_PSN_OUT_OF_SEQ:
+                       /* RC only - Class B. Drop packet. */
+                       send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
+                       state = RESPST_CLEANUP;
+                       break;
+
+               case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ:
+               case RESPST_ERR_MISSING_OPCODE_FIRST:
+               case RESPST_ERR_MISSING_OPCODE_LAST_C:
+               case RESPST_ERR_UNSUPPORTED_OPCODE:
+               case RESPST_ERR_MISALIGNED_ATOMIC:
+                       /* RC Only - Class C. */
+                       do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+                                         IB_WC_REM_INV_REQ_ERR);
+                       state = RESPST_COMPLETE;
+                       break;
+
+               case RESPST_ERR_MISSING_OPCODE_LAST_D1E:
+                       state = do_class_d1e_error(qp);
+                       break;
+               case RESPST_ERR_RNR:
+                       if (qp_type(qp) == IB_QPT_RC) {
+                               /* RC - class B */
+                               /* RNR NAK syndrome with the QP's min RNR
+                                * timer in the non-type bits
+                                */
+                               send_ack(qp, pkt, AETH_RNR_NAK |
+                                        (~AETH_TYPE_MASK &
+                                        qp->attr.min_rnr_timer),
+                                        pkt->psn);
+                       } else {
+                               /* UD/UC - class D */
+                               qp->resp.drop_msg = 1;
+                       }
+                       state = RESPST_CLEANUP;
+                       break;
+
+               case RESPST_ERR_RKEY_VIOLATION:
+                       if (qp_type(qp) == IB_QPT_RC) {
+                               /* Class C */
+                               do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR,
+                                                 IB_WC_REM_ACCESS_ERR);
+                               state = RESPST_COMPLETE;
+                       } else {
+                               qp->resp.drop_msg = 1;
+                               if (qp->srq) {
+                                       /* UC/SRQ Class D */
+                                       qp->resp.status = IB_WC_REM_ACCESS_ERR;
+                                       state = RESPST_COMPLETE;
+                               } else {
+                                       /* UC/non-SRQ Class E. */
+                                       state = RESPST_CLEANUP;
+                               }
+                       }
+                       break;
+
+               case RESPST_ERR_LENGTH:
+                       if (qp_type(qp) == IB_QPT_RC) {
+                               /* Class C */
+                               do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+                                                 IB_WC_REM_INV_REQ_ERR);
+                               state = RESPST_COMPLETE;
+                       } else if (qp->srq) {
+                               /* UC/UD - class E */
+                               qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+                               state = RESPST_COMPLETE;
+                       } else {
+                               /* UC/UD - class D */
+                               qp->resp.drop_msg = 1;
+                               state = RESPST_CLEANUP;
+                       }
+                       break;
+
+               case RESPST_ERR_MALFORMED_WQE:
+                       /* All, Class A. */
+                       do_class_ac_error(qp, AETH_NAK_REM_OP_ERR,
+                                         IB_WC_LOC_QP_OP_ERR);
+                       state = RESPST_COMPLETE;
+                       break;
+
+               case RESPST_ERR_CQ_OVERFLOW:
+                       /* All - Class G */
+                       state = RESPST_ERROR;
+                       break;
+
+               case RESPST_DONE:
+                       /* a class A/C error recorded earlier is acted on
+                        * only now, after the request has been finished
+                        */
+                       if (qp->resp.goto_error) {
+                               state = RESPST_ERROR;
+                               break;
+                       }
+
+                       goto done;
+
+               case RESPST_EXIT:
+                       if (qp->resp.goto_error) {
+                               state = RESPST_ERROR;
+                               break;
+                       }
+
+                       goto exit;
+
+               case RESPST_RESET: {
+                       struct sk_buff *skb;
+
+                       /* free every queued request packet, dropping one qp
+                        * reference per packet
+                        */
+                       while ((skb = skb_dequeue(&qp->req_pkts))) {
+                               rxe_drop_ref(qp);
+                               kfree_skb(skb);
+                       }
+
+                       /* drain the QP's own receive queue (not an SRQ) */
+                       while (!qp->srq && qp->rq.queue &&
+                              queue_head(qp->rq.queue))
+                               advance_consumer(qp->rq.queue);
+
+                       qp->resp.wqe = NULL;
+                       goto exit;
+               }
+
+               case RESPST_ERROR:
+                       qp->resp.goto_error = 0;
+                       pr_warn("qp#%d moved to error state\n", qp_num(qp));
+                       rxe_qp_error(qp);
+                       goto exit;
+
+               default:
+                       WARN_ON(1);
+               }
+       }
+
+exit:
+       ret = -EAGAIN;
+done:
+       return ret;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
new file mode 100644 (file)
index 0000000..2a6e3cd
--- /dev/null
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/* Validate SRQ attributes against the device limits and, when srq is not
+ * NULL, against the state of that existing SRQ.  Requests smaller than the
+ * RXE minimums are raised in place in *attr.
+ *
+ * Returns 0 when the attributes are acceptable, -EINVAL otherwise.
+ */
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                    struct ib_srq_attr *attr, enum ib_srq_attr_mask mask)
+{
+       if (srq && srq->error) {
+               pr_warn("srq in error state\n");
+               return -EINVAL;
+       }
+
+       if (mask & IB_SRQ_MAX_WR) {
+               if (attr->max_wr > rxe->attr.max_srq_wr) {
+                       pr_warn("max_wr(%d) > max_srq_wr(%d)\n",
+                               attr->max_wr, rxe->attr.max_srq_wr);
+                       return -EINVAL;
+               }
+
+               if (attr->max_wr <= 0) {
+                       pr_warn("max_wr(%d) <= 0\n", attr->max_wr);
+                       return -EINVAL;
+               }
+
+               /* the queue may not shrink below the currently armed limit */
+               if (srq && srq->limit && (attr->max_wr < srq->limit)) {
+                       pr_warn("max_wr (%d) < srq->limit (%d)\n",
+                               attr->max_wr, srq->limit);
+                       return -EINVAL;
+               }
+
+               /* small requests are rounded up rather than rejected */
+               if (attr->max_wr < RXE_MIN_SRQ_WR)
+                       attr->max_wr = RXE_MIN_SRQ_WR;
+       }
+
+       if (mask & IB_SRQ_LIMIT) {
+               if (attr->srq_limit > rxe->attr.max_srq_wr) {
+                       pr_warn("srq_limit(%d) > max_srq_wr(%d)\n",
+                               attr->srq_limit, rxe->attr.max_srq_wr);
+                       return -EINVAL;
+               }
+
+               /* the limit must also fit the queue that already exists */
+               if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) {
+                       pr_warn("srq_limit (%d) > cur limit(%d)\n",
+                               attr->srq_limit,
+                                srq->rq.queue->buf->index_mask);
+                       return -EINVAL;
+               }
+       }
+
+       /* max_sge is examined only when mask is exactly IB_SRQ_INIT_MASK */
+       if (mask == IB_SRQ_INIT_MASK) {
+               if (attr->max_sge > rxe->attr.max_srq_sge) {
+                       pr_warn("max_sge(%d) > max_srq_sge(%d)\n",
+                               attr->max_sge, rxe->attr.max_srq_sge);
+                       return -EINVAL;
+               }
+
+               if (attr->max_sge < RXE_MIN_SRQ_SGE)
+                       attr->max_sge = RXE_MIN_SRQ_SGE;
+       }
+
+       return 0;
+}
+
+/* Populate a newly allocated SRQ from the creation attributes and allocate
+ * its receive queue.  When udata has room for it, the SRQ number is copied
+ * to user space immediately after a struct mminfo in the output buffer.
+ *
+ * Returns 0 on success, -ENOMEM if the queue cannot be allocated, the error
+ * from do_mmap_info(), or -EFAULT if the copy to user space fails.
+ *
+ * NOTE(review): the failure paths after srq->rq.queue has been set do not
+ * release the queue here; presumably the caller's teardown does - confirm.
+ */
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_init_attr *init,
+                     struct ib_ucontext *context, struct ib_udata *udata)
+{
+       struct rxe_queue *q;
+       int wqe_size;
+       int err;
+
+       srq->ibsrq.event_handler = init->event_handler;
+       srq->ibsrq.srq_context = init->srq_context;
+       srq->limit = init->attr.srq_limit;
+       srq->srq_num = srq->pelem.index;
+       srq->rq.max_wr = init->attr.max_wr;
+       srq->rq.max_sge = init->attr.max_sge;
+
+       wqe_size = rcv_wqe_size(srq->rq.max_sge);
+
+       spin_lock_init(&srq->rq.producer_lock);
+       spin_lock_init(&srq->rq.consumer_lock);
+
+       /* max_wr is passed by address, so rxe_queue_init() may adjust it */
+       q = rxe_queue_init(rxe, &srq->rq.max_wr, wqe_size);
+       if (!q) {
+               pr_warn("unable to allocate queue for srq\n");
+               return -ENOMEM;
+       }
+
+       srq->rq.queue = q;
+
+       err = do_mmap_info(rxe, udata, false, context, q->buf,
+                          q->buf_size, &q->ip);
+       if (err)
+               return err;
+
+       /* the SRQ number follows the mminfo record in the response buffer */
+       if (udata && udata->outlen >= sizeof(struct mminfo) + sizeof(u32) &&
+           copy_to_user(udata->outbuf + sizeof(struct mminfo),
+                        &srq->srq_num, sizeof(u32)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/* Apply a modify-SRQ request: resize the receive queue when IB_SRQ_MAX_WR is
+ * set and update the limit when IB_SRQ_LIMIT is set.
+ *
+ * For a resize with udata, the first 64 bits of the input carry the user
+ * space address of a struct mminfo; udata's output buffer is redirected to
+ * that address before the resize.
+ *
+ * Returns 0 on success or a negative errno.  If rxe_queue_resize() fails the
+ * existing queue is cleaned up and srq->rq.queue is set to NULL.
+ */
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+                     struct ib_udata *udata)
+{
+       struct rxe_queue *q = srq->rq.queue;
+       int err;
+
+       if (mask & IB_SRQ_MAX_WR) {
+               /* Check that we can write the mminfo struct to user space */
+               if (udata && udata->inlen >= sizeof(__u64)) {
+                       __u64 mi_addr;
+
+                       /* Get address of user space mminfo struct */
+                       err = ib_copy_from_udata(&mi_addr, udata,
+                                                sizeof(mi_addr));
+                       if (err)
+                               return err;
+
+                       udata->outbuf = (void __user *)(unsigned long)mi_addr;
+                       udata->outlen = sizeof(struct mminfo);
+
+                       if (!access_ok(VERIFY_WRITE,
+                                      (void __user *)udata->outbuf,
+                                      udata->outlen))
+                               return -EFAULT;
+               }
+
+               err = rxe_queue_resize(q, (unsigned int *)&attr->max_wr,
+                                      rcv_wqe_size(srq->rq.max_sge),
+                                      q->ip ? q->ip->context : NULL,
+                                      udata, &srq->rq.producer_lock,
+                                      &srq->rq.consumer_lock);
+               if (err) {
+                       rxe_queue_cleanup(q);
+                       srq->rq.queue = NULL;
+                       return err;
+               }
+       }
+
+       if (mask & IB_SRQ_LIMIT)
+               srq->limit = attr->srq_limit;
+
+       return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
new file mode 100644 (file)
index 0000000..cf8e778
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_net.h"
+
+/* Copy an interface name from val into intf, stopping at the first newline
+ * or at the end of the string, and NUL-terminate the copy.
+ *
+ * Returns the length of the copied name, or 0 when val is NULL, the name is
+ * empty, or the name does not fit in intf_len bytes including the NUL.
+ */
+static int sanitize_arg(const char *val, char *intf, int intf_len)
+{
+       int n = 0;
+
+       if (!val)
+               return 0;
+
+       while (n < intf_len - 1 && val[n] != '\0' && val[n] != '\n') {
+               intf[n] = val[n];
+               n++;
+       }
+       intf[n] = '\0';
+
+       if (n == 0)
+               return 0;
+
+       /* stopped on something other than the terminator: name was too long */
+       if (val[n] != '\0' && val[n] != '\n')
+               return 0;
+
+       return n;
+}
+
+/* Bring the rxe port up or down to match the current state of ndev.
+ * Does nothing when no rxe device is attached to ndev.
+ */
+static void rxe_set_port_state(struct net_device *ndev)
+{
+       struct rxe_dev *rxe = net_to_rxe(ndev);
+
+       if (!rxe)
+               return;
+
+       /* the port is up only if the interface is running and has carrier */
+       if (netif_running(ndev) && netif_carrier_ok(ndev))
+               rxe_port_up(rxe);
+       else
+               rxe_port_down(rxe); /* down for unknown state */
+}
+
+/* Setter for the "add" module parameter: create an rxe device on top of the
+ * network interface named in val.
+ *
+ * Returns 0 on success, or -EINVAL when the name is invalid, the interface
+ * does not exist, it already has an rxe device, or rxe_net_add() fails.
+ */
+static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
+{
+       char intf[32];
+       struct net_device *ndev = NULL;
+       struct rxe_dev *rxe;
+       int err = -EINVAL;      /* every failure below reports -EINVAL */
+
+       if (!sanitize_arg(val, intf, sizeof(intf))) {
+               pr_err("rxe: add: invalid interface name\n");
+               goto out;
+       }
+
+       ndev = dev_get_by_name(&init_net, intf);
+       if (!ndev) {
+               pr_err("interface %s not found\n", intf);
+               goto out;
+       }
+
+       if (net_to_rxe(ndev)) {
+               pr_err("rxe: already configured on %s\n", intf);
+               goto out;
+       }
+
+       rxe = rxe_net_add(ndev);
+       if (!rxe) {
+               pr_err("rxe: failed to add %s\n", intf);
+               goto out;
+       }
+
+       rxe_set_port_state(ndev);
+       pr_info("rxe: added %s to %s\n", rxe->ib_dev.name, intf);
+       err = 0;
+out:
+       /* drop the reference taken by dev_get_by_name(), success or not */
+       if (ndev)
+               dev_put(ndev);
+       return err;
+}
+
+/* Setter for the "remove" module parameter: remove the rxe device named in
+ * val, or every rxe device when val is exactly "all".
+ *
+ * Returns 0 on success, or -EINVAL when the name is invalid or no rxe device
+ * matches it.
+ */
+static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
+{
+       char intf[32];
+       struct rxe_dev *rxe;
+
+       if (!sanitize_arg(val, intf, sizeof(intf))) {
+               pr_err("rxe: remove: invalid interface name\n");
+               return -EINVAL;
+       }
+
+       /* Compare the whole string: a length-limited compare bounded by the
+        * length of intf would also accept "a" and "al" and remove every
+        * device.
+        */
+       if (strcmp(intf, "all") == 0) {
+               pr_info("rxe_sys: remove all\n");
+               rxe_remove_all();
+               return 0;
+       }
+
+       rxe = get_rxe_by_name(intf);
+
+       if (!rxe) {
+               pr_err("rxe: not configured on %s\n", intf);
+               return -EINVAL;
+       }
+
+       /* NOTE(review): the list is modified without taking any lock here;
+        * confirm whether the device list needs protection at this point.
+        */
+       list_del(&rxe->list);
+       rxe_remove(rxe);
+
+       return 0;
+}
+
+static const struct kernel_param_ops rxe_add_ops = {
+       .set = rxe_param_set_add,       /* only a setter is provided */
+};
+
+static const struct kernel_param_ops rxe_remove_ops = {
+       .set = rxe_param_set_remove,    /* only a setter is provided */
+};
+
+module_param_cb(add, &rxe_add_ops, NULL, 0200); /* mode 0200: owner write only */
+MODULE_PARM_DESC(add, "Create RXE device over network interface");
+module_param_cb(remove, &rxe_remove_ops, NULL, 0200); /* mode 0200: owner write only */
+MODULE_PARM_DESC(remove, "Remove RXE device over network interface");
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
new file mode 100644 (file)
index 0000000..1e19bf8
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/hardirq.h>
+
+#include "rxe_task.h"
+
+/* Call the task function back to back, with no state checking or locking,
+ * until it returns non-zero.  That value is stored in task->ret and
+ * returned.
+ */
+int __rxe_do_task(struct rxe_task *task)
+{
+       int ret;
+
+       do {
+               ret = task->func(task->arg);
+       } while (ret == 0);
+
+       task->ret = ret;
+
+       return ret;
+}
+
+/*
+ * Run the task function until it returns non-zero, allowing only one
+ * caller at a time to execute it.
+ *
+ * A caller that finds the task in TASK_STATE_START moves it to
+ * TASK_STATE_BUSY and runs the function.  A caller that finds it BUSY marks
+ * it TASK_STATE_ARMED and returns; the running caller then calls the
+ * function one more time before going idle.
+ *
+ * this locking is due to a potential race where
+ * a second caller finds the task already running
+ * but looks just after the last call to func
+ */
+void rxe_do_task(unsigned long data)
+{
+       int cont;
+       int ret;
+       unsigned long flags;
+       struct rxe_task *task = (struct rxe_task *)data;
+
+       spin_lock_irqsave(&task->state_lock, flags);
+       switch (task->state) {
+       case TASK_STATE_START:
+               task->state = TASK_STATE_BUSY; /* this caller becomes the runner */
+               spin_unlock_irqrestore(&task->state_lock, flags);
+               break;
+
+       case TASK_STATE_BUSY:
+               task->state = TASK_STATE_ARMED; /* ask the runner for one more pass */
+               /* fall through to */
+       case TASK_STATE_ARMED:
+               spin_unlock_irqrestore(&task->state_lock, flags);
+               return;
+
+       default:
+               spin_unlock_irqrestore(&task->state_lock, flags);
+               pr_warn("bad state = %d in rxe_do_task\n", task->state);
+               return;
+       }
+
+       do {
+               cont = 0;
+               ret = task->func(task->arg); /* called without state_lock held */
+
+               spin_lock_irqsave(&task->state_lock, flags);
+               switch (task->state) {
+               case TASK_STATE_BUSY:
+                       if (ret)
+                               task->state = TASK_STATE_START; /* done: back to idle */
+                       else
+                               cont = 1;
+                       break;
+
+               /* someone tried to run the task since the last time we called
+                * func, so we will call one more time regardless of the
+                * return value
+                */
+               case TASK_STATE_ARMED:
+                       task->state = TASK_STATE_BUSY;
+                       cont = 1;
+                       break;
+
+               default:
+                       pr_warn("bad state = %d in rxe_do_task\n",
+                               task->state);
+               }
+               spin_unlock_irqrestore(&task->state_lock, flags);
+       } while (cont);
+
+       task->ret = ret; /* result of the final call to func */
+}
+
+/* Initialise a task: record the owner object, the function and its argument,
+ * copy the name (truncated to fit task->name), set up the tasklet that runs
+ * rxe_do_task() and leave the task in TASK_STATE_START.
+ *
+ * Always returns 0.
+ */
+int rxe_init_task(void *obj, struct rxe_task *task,
+                 void *arg, int (*func)(void *), char *name)
+{
+       spin_lock_init(&task->state_lock);
+       task->state = TASK_STATE_START;
+
+       task->obj = obj;
+       task->func = func;
+       task->arg = arg;
+       snprintf(task->name, sizeof(task->name), "%s", name);
+
+       /* the tasklet receives the task itself as its data word */
+       tasklet_init(&task->tasklet, rxe_do_task, (unsigned long)task);
+
+       return 0;
+}
+
+void rxe_cleanup_task(struct rxe_task *task)
+{
+       tasklet_kill(&task->tasklet); /* kill the tasklet set up by rxe_init_task() */
+}
+
+/* Run the task in the caller's context when sched is zero; otherwise
+ * schedule its tasklet so that it runs later.
+ */
+void rxe_run_task(struct rxe_task *task, int sched)
+{
+       if (!sched) {
+               rxe_do_task((unsigned long)task);
+               return;
+       }
+
+       tasklet_schedule(&task->tasklet);
+}
+
+void rxe_disable_task(struct rxe_task *task)
+{
+       tasklet_disable(&task->tasklet); /* undone by rxe_enable_task() */
+}
+
+void rxe_enable_task(struct rxe_task *task)
+{
+       tasklet_enable(&task->tasklet); /* counterpart of rxe_disable_task() */
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
new file mode 100644 (file)
index 0000000..d14aa6d
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_TASK_H
+#define RXE_TASK_H
+
+enum {
+       TASK_STATE_START        = 0,    /* idle: the next caller runs the task */
+       TASK_STATE_BUSY         = 1,    /* a caller is running the task function */
+       TASK_STATE_ARMED        = 2,    /* busy, and another run was requested */
+};
+
+/*
+ * data structure to describe a 'task' which is a short
+ * function that returns 0 as long as it needs to be
+ * called again.
+ */
+struct rxe_task {
+       void                    *obj;           /* owner, as passed to rxe_init_task() */
+       struct tasklet_struct   tasklet;        /* runs rxe_do_task() when scheduled */
+       int                     state;          /* one of the TASK_STATE_* values */
+       spinlock_t              state_lock; /* spinlock for task state */
+       void                    *arg;           /* argument handed to func */
+       int                     (*func)(void *arg);
+       int                     ret;            /* last non-zero value returned by func */
+       char                    name[16];       /* copy of the name given at init, may be truncated */
+};
+
+/*
+ * init rxe_task structure
+ *     obj  => owner of the task, stored in task->obj
+ *     arg  => parameter to pass to func
+ *     func => function to call until it returns != 0
+ *     name => name copied into task->name
+ * always returns 0
+ */
+int rxe_init_task(void *obj, struct rxe_task *task,
+                 void *arg, int (*func)(void *), char *name);
+
+/* cleanup task: kills the tasklet that backs it */
+void rxe_cleanup_task(struct rxe_task *task);
+
+/*
+ * raw call to func in a loop without any state checking or locking;
+ * can be called when tasklets are disabled.
+ * returns the first non-zero value returned by func
+ */
+int __rxe_do_task(struct rxe_task *task);
+
+/*
+ * common function called by any of the main tasklets.
+ * If there is any chance that there is additional
+ * work to do, someone must reschedule the task before
+ * leaving.
+ * data is the struct rxe_task pointer cast to unsigned long
+ */
+void rxe_do_task(unsigned long data);
+
+/* run a task directly, or schedule it to run as a tasklet. The decision
+ * is based on the parameter sched: non-zero schedules the tasklet,
+ * zero runs the task in the caller's context.
+ */
+void rxe_run_task(struct rxe_task *task, int sched);
+
+/* keep a task from scheduling */
+void rxe_disable_task(struct rxe_task *task);
+
+/* allow task to run */
+void rxe_enable_task(struct rxe_task *task);
+
+#endif /* RXE_TASK_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
new file mode 100644 (file)
index 0000000..4552be9
--- /dev/null
@@ -0,0 +1,1330 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/* Copy the device attributes into *attr.
+ * Returns -EINVAL if the caller supplied any input or output udata.
+ */
+static int rxe_query_device(struct ib_device *dev,
+                           struct ib_device_attr *attr,
+                           struct ib_udata *uhw)
+{
+       if (uhw->inlen || uhw->outlen)
+               return -EINVAL;
+
+       *attr = to_rdev(dev)->attr;
+       return 0;
+}
+
+/* Translate an Ethernet link speed (compared against thresholds from 1000
+ * to 40000) into an IB active speed and active width pair.
+ */
+static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed,
+                                     u8 *active_width)
+{
+       u8 ib_width = IB_WIDTH_4X;      /* everything above 10000 uses 4X */
+       u8 ib_speed;
+
+       if (speed <= 1000) {
+               ib_width = IB_WIDTH_1X;
+               ib_speed = IB_SPEED_SDR;
+       } else if (speed <= 10000) {
+               ib_width = IB_WIDTH_1X;
+               ib_speed = IB_SPEED_FDR10;
+       } else if (speed <= 20000) {
+               ib_speed = IB_SPEED_DDR;
+       } else if (speed <= 30000) {
+               ib_speed = IB_SPEED_QDR;
+       } else if (speed <= 40000) {
+               ib_speed = IB_SPEED_FDR10;
+       } else {
+               ib_speed = IB_SPEED_EDR;
+       }
+
+       *active_width = ib_width;
+       *active_speed = ib_speed;
+}
+
+/* Copy the port attributes into *attr and derive the IB active speed and
+ * width from the link speed reported by the underlying net device.
+ *
+ * Only port 1 is valid.  Returns 0 on success, -EINVAL for any other port.
+ */
+static int rxe_query_port(struct ib_device *dev,
+                         u8 port_num, struct ib_port_attr *attr)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+       struct rxe_port *port;
+       u32 speed = 1000;       /* fallback when no speed can be obtained */
+
+       if (unlikely(port_num != 1)) {
+               pr_warn("invalid port_number %d\n", port_num);
+               goto err1;
+       }
+
+       port = &rxe->port;
+
+       *attr = port->attr;
+
+       mutex_lock(&rxe->usdev_lock);
+       if (rxe->ndev->ethtool_ops->get_link_ksettings) {
+               struct ethtool_link_ksettings ks;
+
+               /* Use the result only when the callback succeeds; otherwise
+                * ks may not have been filled in.
+                */
+               memset(&ks, 0, sizeof(ks));
+               if (!rxe->ndev->ethtool_ops->get_link_ksettings(rxe->ndev, &ks))
+                       speed = ks.base.speed;
+       } else if (rxe->ndev->ethtool_ops->get_settings) {
+               struct ethtool_cmd cmd;
+
+               memset(&cmd, 0, sizeof(cmd));
+               cmd.cmd = ETHTOOL_GSET;
+               /* ethtool_cmd_speed() combines speed_hi with speed; cmd.speed
+                * alone holds only the low 16 bits.
+                */
+               if (!rxe->ndev->ethtool_ops->get_settings(rxe->ndev, &cmd))
+                       speed = ethtool_cmd_speed(&cmd);
+       } else {
+               pr_warn("%s speed is unknown, defaulting to 1000\n", rxe->ndev->name);
+       }
+       rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, &attr->active_width);
+       mutex_unlock(&rxe->usdev_lock);
+
+       return 0;
+
+err1:
+       return -EINVAL;
+}
+
+/* Look up the GID at the given table index in the cached GID table.
+ *
+ * Returns -EINVAL for an index at or beyond RXE_PORT_GID_TBL_LEN.  When the
+ * cache lookup returns -EAGAIN the zero GID is stored in *gid and 0 is
+ * returned; any other result of the lookup is passed through.
+ */
+static int rxe_query_gid(struct ib_device *device,
+                        u8 port_num, int index, union ib_gid *gid)
+{
+       int ret;
+
+       /* valid indices are 0 .. RXE_PORT_GID_TBL_LEN - 1, the same bound
+        * that rxe_add_gid() and rxe_del_gid() enforce
+        */
+       if (index >= RXE_PORT_GID_TBL_LEN)
+               return -EINVAL;
+
+       ret = ib_get_cached_gid(device, port_num, index, gid, NULL);
+       if (ret == -EAGAIN) {
+               memcpy(gid, &zgid, sizeof(*gid));
+               return 0;
+       }
+
+       return ret;
+}
+
+/* GID-add hook: only validates that index lies inside the GID table.
+ * Returns 0 for a valid index, -EINVAL otherwise.
+ */
+static int rxe_add_gid(struct ib_device *device, u8 port_num,
+                      unsigned int index, const union ib_gid *gid,
+                      const struct ib_gid_attr *attr, void **context)
+{
+       return index >= RXE_PORT_GID_TBL_LEN ? -EINVAL : 0;
+}
+
+/* GID-delete hook: only validates that index lies inside the GID table.
+ * Returns 0 for a valid index, -EINVAL otherwise.
+ */
+static int rxe_del_gid(struct ib_device *device, u8 port_num,
+                      unsigned int index, void **context)
+{
+       return index >= RXE_PORT_GID_TBL_LEN ? -EINVAL : 0;
+}
+
+/* Return the net device underneath this rxe device with a reference held,
+ * or NULL when there is none.
+ */
+static struct net_device *rxe_get_netdev(struct ib_device *device,
+                                        u8 port_num)
+{
+       struct net_device *ndev = to_rdev(device)->ndev;
+
+       if (!ndev)
+               return NULL;
+
+       dev_hold(ndev);
+       return ndev;
+}
+
+/* Read the P_Key stored at index in the port's P_Key table.
+ * Returns 0 on success, -EINVAL for a port other than 1 or an index beyond
+ * the table length.
+ */
+static int rxe_query_pkey(struct ib_device *device,
+                         u8 port_num, u16 index, u16 *pkey)
+{
+       struct rxe_port *port = &to_rdev(device)->port;
+
+       if (unlikely(port_num != 1)) {
+               dev_warn(device->dma_device, "invalid port_num = %d\n",
+                        port_num);
+               return -EINVAL;
+       }
+
+       if (unlikely(index >= port->attr.pkey_tbl_len)) {
+               dev_warn(device->dma_device, "invalid index = %d\n",
+                        index);
+               return -EINVAL;
+       }
+
+       *pkey = port->pkey_tbl[index];
+       return 0;
+}
+
+/* Update the system image GUID and/or the node description, as selected by
+ * mask.  Always returns 0.
+ */
+static int rxe_modify_device(struct ib_device *dev,
+                            int mask, struct ib_device_modify *attr)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+
+       if (mask & IB_DEVICE_MODIFY_NODE_DESC)
+               memcpy(rxe->ib_dev.node_desc, attr->node_desc,
+                      sizeof(rxe->ib_dev.node_desc));
+
+       if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+               rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid);
+
+       return 0;
+}
+
+/* Set and clear port capability flags and, when requested through mask,
+ * reset the Q_Key violation counter.
+ * Returns 0 on success, -EINVAL for a port other than 1.
+ */
+static int rxe_modify_port(struct ib_device *dev,
+                          u8 port_num, int mask, struct ib_port_modify *attr)
+{
+       struct rxe_port *port = &to_rdev(dev)->port;
+
+       if (unlikely(port_num != 1)) {
+               pr_warn("invalid port_num = %d\n", port_num);
+               return -EINVAL;
+       }
+
+       /* set first, then clear: a bit present in both masks ends up clear */
+       port->attr.port_cap_flags |= attr->set_port_cap_mask;
+       port->attr.port_cap_flags &= ~attr->clr_port_cap_mask;
+
+       if (mask & IB_PORT_RESET_QKEY_CNTR)
+               port->attr.qkey_viol_cntr = 0;
+
+       return 0;
+}
+
+static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
+                                              u8 port_num)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+
+       return rxe->ifc_ops->link_layer(rxe, port_num); /* answered by the device's ifc_ops */
+}
+
+/* Allocate a user context from the device's ucontext pool.
+ * Returns the embedded ib_ucontext, or ERR_PTR(-ENOMEM) if the pool
+ * allocation fails.
+ */
+static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev,
+                                             struct ib_udata *udata)
+{
+       struct rxe_ucontext *uc = rxe_alloc(&to_rdev(dev)->uc_pool);
+
+       if (!uc)
+               return ERR_PTR(-ENOMEM);
+
+       return &uc->ibuc;
+}
+
+/* Drop the reference on the user context.  Always returns 0. */
+static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
+{
+       rxe_drop_ref(to_ruc(ibuc));
+       return 0;
+}
+
+/* Fill in the immutable port data: table lengths come from
+ * rxe_query_port(), the capability flags and MAD size are fixed values.
+ * Returns 0 on success or the error from rxe_query_port().
+ */
+static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
+                             struct ib_port_immutable *immutable)
+{
+       struct ib_port_attr attr;
+       int err = rxe_query_port(dev, port_num, &attr);
+
+       if (err)
+               return err;
+
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+       immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       immutable->gid_tbl_len = attr.gid_tbl_len;
+
+       return 0;
+}
+
+/* Allocate a protection domain from the device's PD pool.
+ * Returns the embedded ib_pd, or ERR_PTR(-ENOMEM) if the pool allocation
+ * fails.
+ */
+static struct ib_pd *rxe_alloc_pd(struct ib_device *dev,
+                                 struct ib_ucontext *context,
+                                 struct ib_udata *udata)
+{
+       struct rxe_pd *pd = rxe_alloc(&to_rdev(dev)->pd_pool);
+
+       if (!pd)
+               return ERR_PTR(-ENOMEM);
+
+       return &pd->ibpd;
+}
+
+/* Drop the reference on the protection domain.  Always returns 0. */
+static int rxe_dealloc_pd(struct ib_pd *ibpd)
+{
+       rxe_drop_ref(to_rpd(ibpd));
+       return 0;
+}
+
+/* Build the address vector av from attr: look up the source GID in the GID
+ * cache, fill av from attr, then add the IP information.
+ * Returns 0 on success or the first error encountered.
+ */
+static int rxe_init_av(struct rxe_dev *rxe, struct ib_ah_attr *attr,
+                      struct rxe_av *av)
+{
+       struct ib_gid_attr sgid_attr;
+       union ib_gid sgid;
+       int err;
+
+       err = ib_get_cached_gid(&rxe->ib_dev, attr->port_num,
+                               attr->grh.sgid_index, &sgid,
+                               &sgid_attr);
+       if (err) {
+               pr_err("Failed to query sgid. err = %d\n", err);
+               return err;
+       }
+
+       err = rxe_av_from_attr(rxe, attr->port_num, av, attr);
+       if (err)
+               goto put_ndev;
+
+       err = rxe_av_fill_ip_info(rxe, av, attr, &sgid_attr, &sgid);
+
+put_ndev:
+       /* release the net device returned along with the GID attributes */
+       if (sgid_attr.ndev)
+               dev_put(sgid_attr.ndev);
+       return err;
+}
+
+/* Create an address handle in the given protection domain.  The new handle
+ * holds a reference on the PD.
+ * Returns the embedded ib_ah, or an ERR_PTR() on failure.
+ */
+static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_ah *ah;
+       int err;
+
+       err = rxe_av_chk_attr(rxe, attr);
+       if (err)
+               return ERR_PTR(err);
+
+       ah = rxe_alloc(&rxe->ah_pool);
+       if (!ah)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_ref(pd);
+       ah->pd = pd;
+
+       err = rxe_init_av(rxe, attr, &ah->av);
+       if (err) {
+               /* undo the PD reference and release the new handle */
+               rxe_drop_ref(pd);
+               rxe_drop_ref(ah);
+               return ERR_PTR(err);
+       }
+
+       return &ah->ibah;
+}
+
+/* Validate attr and rebuild the handle's address vector from it.
+ * Returns 0 on success or the error from the check or the rebuild.
+ */
+static int rxe_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+       struct rxe_dev *rxe = to_rdev(ibah->device);
+       struct rxe_ah *ah = to_rah(ibah);
+       int err = rxe_av_chk_attr(rxe, attr);
+
+       if (err)
+               return err;
+
+       return rxe_init_av(rxe, attr, &ah->av);
+}
+
+/* Convert the handle's address vector back into attr.  Always returns 0. */
+static int rxe_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+       rxe_av_to_attr(to_rdev(ibah->device), &to_rah(ibah)->av, attr);
+       return 0;
+}
+
+static int rxe_destroy_ah(struct ib_ah *ibah)
+{
+       struct rxe_ah *ah = to_rah(ibah);
+
+       rxe_drop_ref(ah->pd); /* pairs with rxe_add_ref(pd) in rxe_create_ah() */
+       rxe_drop_ref(ah); /* dropped only after ah->pd has been read */
+       return 0;
+}
+
+/*
+ * Copy one receive work request into the slot at the producer end of @rq
+ * and advance the producer index.
+ *
+ * Returns -ENOMEM when the queue is full, -EINVAL when the request carries
+ * more scatter/gather entries than rq->max_sge, and 0 otherwise.  Both
+ * callers in this file invoke it with rq->producer_lock held.
+ */
+static int post_one_recv(struct rxe_rq *rq, struct ib_recv_wr *ibwr)
+{
+       struct rxe_recv_wqe *wqe;
+       int nsge = ibwr->num_sge;
+       u32 total = 0;
+       int idx;
+
+       if (unlikely(queue_full(rq->queue)))
+               return -ENOMEM;
+
+       if (unlikely(nsge > rq->max_sge))
+               return -EINVAL;
+
+       /* total number of bytes described by the scatter/gather list */
+       for (idx = 0; idx < nsge; idx++)
+               total += ibwr->sg_list[idx].length;
+
+       wqe = producer_addr(rq->queue);
+       wqe->wr_id = ibwr->wr_id;
+       wqe->num_sge = nsge;
+
+       memcpy(wqe->dma.sge, ibwr->sg_list, nsge * sizeof(struct ib_sge));
+
+       wqe->dma.length = total;
+       wqe->dma.resid = total;
+       wqe->dma.num_sge = nsge;
+       wqe->dma.cur_sge = 0;
+       wqe->dma.sge_offset = 0;
+
+       /* the WQE contents must be written out before the producer
+        * index is moved
+        */
+       smp_wmb();
+
+       advance_producer(rq->queue);
+
+       return 0;
+}
+
+/*
+ * Create a shared receive queue in protection domain @ibpd.
+ *
+ * The initial attributes are checked with rxe_srq_chk_attr(), an rxe_srq is
+ * taken from the device's srq_pool, it is given an index and a reference is
+ * added on the PD before rxe_srq_from_init() sets the queue up.  The user
+ * context is only looked up when @udata is supplied.
+ *
+ * Returns the embedded ib_srq on success or an ERR_PTR(); if
+ * rxe_srq_from_init() fails the PD reference, the index and the SRQ
+ * reference are all released.
+ */
+static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
+                                    struct ib_srq_init_attr *init,
+                                    struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
+       struct rxe_srq *srq;
+       int rc;
+
+       rc = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK);
+       if (rc)
+               return ERR_PTR(rc);
+
+       srq = rxe_alloc(&rxe->srq_pool);
+       if (!srq)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(srq);
+       rxe_add_ref(pd);
+       srq->pd = pd;
+
+       rc = rxe_srq_from_init(rxe, srq, init, context, udata);
+       if (rc) {
+               rxe_drop_ref(pd);
+               rxe_drop_index(srq);
+               rxe_drop_ref(srq);
+               return ERR_PTR(rc);
+       }
+
+       return &srq->ibsrq;
+}
+
+/*
+ * Modify a shared receive queue.  The requested attributes are checked with
+ * rxe_srq_chk_attr() and, if acceptable, applied by rxe_srq_from_attr().
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                         enum ib_srq_attr_mask mask,
+                         struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(ibsrq->device);
+       struct rxe_srq *srq = to_rsrq(ibsrq);
+       int rc;
+
+       rc = rxe_srq_chk_attr(rxe, srq, attr, mask);
+       if (!rc)
+               rc = rxe_srq_from_attr(rxe, srq, attr, mask, udata);
+
+       return rc;
+}
+
+/*
+ * Report the current attributes of a shared receive queue in @attr.
+ * Returns -EINVAL when srq->error is set, otherwise 0.
+ */
+static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+       struct rxe_srq *srq = to_rsrq(ibsrq);
+
+       if (srq->error)
+               return -EINVAL;
+
+       attr->srq_limit = srq->limit;
+       attr->max_sge = srq->rq.max_sge;
+       /* max_wr is reported as the index mask of the queue buffer */
+       attr->max_wr = srq->rq.queue->buf->index_mask;
+
+       return 0;
+}
+
+/*
+ * Destroy a shared receive queue: clean up its receive queue if one exists,
+ * then drop the PD reference, the index and the SRQ reference.
+ * Always returns 0.
+ */
+static int rxe_destroy_srq(struct ib_srq *ibsrq)
+{
+       struct rxe_srq *victim = to_rsrq(ibsrq);
+
+       if (victim->rq.queue)
+               rxe_queue_cleanup(victim->rq.queue);
+
+       rxe_drop_ref(victim->pd);
+       rxe_drop_index(victim);
+       rxe_drop_ref(victim);
+
+       return 0;
+}
+
+/*
+ * Post a chain of receive work requests to a shared receive queue.
+ *
+ * The chain is walked under the receive queue's producer_lock and posting
+ * stops at the first request that post_one_recv() rejects.  On failure
+ * *@bad_wr is set to that request and its error code is returned; on
+ * success 0 is returned and *@bad_wr is left untouched.
+ */
+static int rxe_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                            struct ib_recv_wr **bad_wr)
+{
+       struct rxe_srq *srq = to_rsrq(ibsrq);
+       unsigned long irq_flags;
+       int rc = 0;
+
+       spin_lock_irqsave(&srq->rq.producer_lock, irq_flags);
+
+       for (; wr; wr = wr->next) {
+               rc = post_one_recv(&srq->rq, wr);
+               if (unlikely(rc))
+                       break;
+       }
+
+       spin_unlock_irqrestore(&srq->rq.producer_lock, irq_flags);
+
+       /* wr still points at the request that failed */
+       if (rc)
+               *bad_wr = wr;
+
+       return rc;
+}
+
+/*
+ * Create a queue pair in protection domain @ibpd.
+ *
+ * The init attributes are checked with rxe_qp_chk_init(), an rxe_qp is taken
+ * from the device's qp_pool, it is given an index and rxe_qp_from_init()
+ * performs the actual setup.  A QP created with @udata is marked as a user
+ * QP; a request whose udata carries input bytes (udata->inlen != 0) is
+ * rejected with -EINVAL.
+ *
+ * Returns the embedded ib_qp on success or an ERR_PTR().  Every failure
+ * after the pool allocation releases the QP object again.
+ */
+static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd,
+                                  struct ib_qp_init_attr *init,
+                                  struct ib_udata *udata)
+{
+       int err;
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_qp *qp;
+
+       err = rxe_qp_chk_init(rxe, init);
+       if (err)
+               goto err1;
+
+       qp = rxe_alloc(&rxe->qp_pool);
+       if (!qp) {
+               err = -ENOMEM;
+               goto err1;
+       }
+
+       if (udata) {
+               if (udata->inlen) {
+                       err = -EINVAL;
+                       /* qp is already allocated (but has no index yet),
+                        * so it must be released rather than leaked
+                        */
+                       goto err2;
+               }
+               qp->is_user = 1;
+       }
+
+       rxe_add_index(qp);
+
+       err = rxe_qp_from_init(rxe, qp, pd, init, udata, ibpd);
+       if (err)
+               goto err3;
+
+       return &qp->ibqp;
+
+err3:
+       rxe_drop_index(qp);
+err2:
+       rxe_drop_ref(qp);
+err1:
+       return ERR_PTR(err);
+}
+
+/*
+ * Modify a queue pair.  The requested attributes are checked with
+ * rxe_qp_chk_attr() and, if acceptable, applied by rxe_qp_from_attr().
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                        int mask, struct ib_udata *udata)
+{
+       struct rxe_qp *qp = to_rqp(ibqp);
+       struct rxe_dev *rxe = to_rdev(ibqp->device);
+       int rc;
+
+       rc = rxe_qp_chk_attr(rxe, qp, attr, mask);
+       if (!rc)
+               rc = rxe_qp_from_attr(qp, attr, mask, udata);
+
+       return rc;
+}
+
+/*
+ * Report the state of a queue pair: @init is filled by rxe_qp_to_init()
+ * and @attr by rxe_qp_to_attr().  Always returns 0.
+ */
+static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                       int mask, struct ib_qp_init_attr *init)
+{
+       struct rxe_qp *queried = to_rqp(ibqp);
+
+       rxe_qp_to_init(queried, init);
+       rxe_qp_to_attr(queried, attr, mask);
+
+       return 0;
+}
+
+/*
+ * Destroy a queue pair: run rxe_qp_destroy(), then drop the QP's index and
+ * its reference.  Always returns 0.
+ */
+static int rxe_destroy_qp(struct ib_qp *ibqp)
+{
+       struct rxe_qp *victim = to_rqp(ibqp);
+
+       rxe_qp_destroy(victim);
+       rxe_drop_index(victim);
+       rxe_drop_ref(victim);
+
+       return 0;
+}
+
+/*
+ * Sanity-check one send work request before it is queued.
+ *
+ * Returns -EINVAL when
+ *  - the request has more scatter/gather entries than the send queue allows,
+ *  - @mask has WR_ATOMIC_MASK set and @length is below 8 bytes or the remote
+ *    address is not 8-byte aligned, or
+ *  - the request is flagged IB_SEND_INLINE and @length exceeds the send
+ *    queue's max_inline;
+ * returns 0 otherwise.
+ */
+static int validate_send_wr(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+                           unsigned int mask, unsigned int length)
+{
+       struct rxe_sq *sq = &qp->sq;
+       int nsge = ibwr->num_sge;
+
+       if (unlikely(nsge > sq->max_sge))
+               return -EINVAL;
+
+       if (unlikely(mask & WR_ATOMIC_MASK)) {
+               if (length < 8 || (atomic_wr(ibwr)->remote_addr & 0x7))
+                       return -EINVAL;
+       }
+
+       if (unlikely((ibwr->send_flags & IB_SEND_INLINE) &&
+                    (length > sq->max_inline)))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Copy the fields of the ib_send_wr @ibwr that the driver needs into the
+ * rxe_send_wr @wr.
+ *
+ * The common fields (wr_id, num_sge, opcode, send_flags) are always copied.
+ * For UD, SMI and GSI QPs the datagram fields are copied (pkey_index only
+ * for GSI, immediate data only for IB_WR_SEND_WITH_IMM).  For every other
+ * QP type the opcode selects which part of the request is copied; opcodes
+ * not listed copy nothing further.
+ */
+static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr,
+                        struct ib_send_wr *ibwr)
+{
+       wr->wr_id = ibwr->wr_id;
+       wr->num_sge = ibwr->num_sge;
+       wr->opcode = ibwr->opcode;
+       wr->send_flags = ibwr->send_flags;
+
+       /* datagram style QPs are handled here and return early */
+       if (qp_type(qp) == IB_QPT_UD ||
+           qp_type(qp) == IB_QPT_SMI ||
+           qp_type(qp) == IB_QPT_GSI) {
+               wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn;
+               wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey;
+               if (qp_type(qp) == IB_QPT_GSI)
+                       wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index;
+               if (wr->opcode == IB_WR_SEND_WITH_IMM)
+                       wr->ex.imm_data = ibwr->ex.imm_data;
+               return;
+       }
+
+       switch (wr->opcode) {
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               wr->ex.imm_data = ibwr->ex.imm_data;
+               /* fall through - remote address and rkey are needed too */
+       case IB_WR_RDMA_READ:
+       case IB_WR_RDMA_WRITE:
+               wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr;
+               wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey;
+               break;
+       case IB_WR_SEND_WITH_IMM:
+               wr->ex.imm_data = ibwr->ex.imm_data;
+               break;
+       case IB_WR_SEND_WITH_INV:
+               wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+               break;
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               wr->wr.atomic.remote_addr = atomic_wr(ibwr)->remote_addr;
+               wr->wr.atomic.compare_add = atomic_wr(ibwr)->compare_add;
+               wr->wr.atomic.swap = atomic_wr(ibwr)->swap;
+               wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey;
+               break;
+       case IB_WR_LOCAL_INV:
+               wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+               break;
+       case IB_WR_REG_MR:
+               wr->wr.reg.mr = reg_wr(ibwr)->mr;
+               wr->wr.reg.key = reg_wr(ibwr)->key;
+               wr->wr.reg.access = reg_wr(ibwr)->access;
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * Fill the send queue entry @wqe from the work request @ibwr.
+ *
+ * @mask is the opcode mask computed by the caller and @length the summed
+ * length of the request's scatter/gather list.
+ *
+ * Returns 0 on success or -EFAULT when copying inline data from user space
+ * fails.
+ *
+ * NOTE(review): post_one_send() calls this with sq_lock held and interrupts
+ * disabled, yet the inline path for user QPs calls copy_from_user() - confirm
+ * that this cannot fault/sleep in that context.
+ */
+static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+                        unsigned int mask, unsigned int length,
+                        struct rxe_send_wqe *wqe)
+{
+       int num_sge = ibwr->num_sge;
+       struct ib_sge *sge;
+       int i;
+       u8 *p;
+
+       init_send_wr(qp, &wqe->wr, ibwr);
+
+       /* datagram style QPs carry a copy of the AH's address vector */
+       if (qp_type(qp) == IB_QPT_UD ||
+           qp_type(qp) == IB_QPT_SMI ||
+           qp_type(qp) == IB_QPT_GSI)
+               memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
+
+       if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) {
+               /* inline send: gather the payload of every SGE into the
+                * WQE's inline_data area, back to back
+                */
+               p = wqe->dma.inline_data;
+
+               sge = ibwr->sg_list;
+               for (i = 0; i < num_sge; i++, sge++) {
+                       if (qp->is_user && copy_from_user(p, (__user void *)
+                                           (uintptr_t)sge->addr, sge->length))
+                               return -EFAULT;
+
+                       else if (!qp->is_user)
+                               memcpy(p, (void *)(uintptr_t)sge->addr,
+                                      sge->length);
+
+                       p += sge->length;
+               }
+       } else if (mask & WR_REG_MASK) {
+               /* WR_REG_MASK requests return here: none of the dma fields,
+                * iova or ssn below are set for them
+                */
+               wqe->mask = mask;
+               wqe->state = wqe_state_posted;
+               return 0;
+       } else
+               memcpy(wqe->dma.sge, ibwr->sg_list,
+                      num_sge * sizeof(struct ib_sge));
+
+       /* the remote address is read through atomic_wr() for atomic
+        * opcodes and through rdma_wr() for everything else
+        */
+       wqe->iova               = (mask & WR_ATOMIC_MASK) ?
+                                       atomic_wr(ibwr)->remote_addr :
+                                       rdma_wr(ibwr)->remote_addr;
+       wqe->mask               = mask;
+       wqe->dma.length         = length;
+       wqe->dma.resid          = length;
+       wqe->dma.num_sge        = num_sge;
+       wqe->dma.cur_sge        = 0;
+       wqe->dma.sge_offset     = 0;
+       wqe->state              = wqe_state_posted;
+       /* each WQE that reaches this point gets the next value of qp->ssn */
+       wqe->ssn                = atomic_add_return(1, &qp->ssn);
+
+       return 0;
+}
+
+/*
+ * Validate one send work request and place it on the QP's send queue.
+ *
+ * Validation happens before the send queue lock is taken.  With the lock
+ * held, the slot at the producer end is filled by init_send_wqe() and the
+ * producer index is advanced.
+ *
+ * Returns 0 on success, the validation error, -ENOMEM when the send queue
+ * is full, or the error reported by init_send_wqe().
+ */
+static int post_one_send(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+                        unsigned mask, u32 length)
+{
+       struct rxe_sq *sq = &qp->sq;
+       struct rxe_send_wqe *wqe;
+       unsigned long irq_flags;
+       int rc;
+
+       rc = validate_send_wr(qp, ibwr, mask, length);
+       if (rc)
+               return rc;
+
+       spin_lock_irqsave(&sq->sq_lock, irq_flags);
+
+       if (unlikely(queue_full(sq->queue))) {
+               rc = -ENOMEM;
+               goto unlock;
+       }
+
+       wqe = producer_addr(sq->queue);
+
+       rc = init_send_wqe(qp, ibwr, mask, length, wqe);
+       if (unlikely(rc))
+               goto unlock;
+
+       /*
+        * the WQE contents must be written out before
+        * the producer index is moved
+        */
+       smp_wmb();
+
+       advance_producer(sq->queue);
+
+unlock:
+       spin_unlock_irqrestore(&sq->sq_lock, irq_flags);
+       return rc;
+}
+
+/*
+ * Post a chain of send work requests to a queue pair.
+ *
+ * Fails with -EINVAL (and *@bad_wr = @wr) when the QP is not valid or its
+ * requester state is below QP_STATE_READY.  Otherwise each request is
+ * checked (known opcode for this QP, inline flag only where the opcode
+ * allows it) and handed to post_one_send(); processing stops at the first
+ * failure, *@bad_wr is set to that request and its error is returned.
+ *
+ * The requester task is run after the loop whether or not an error
+ * occurred.
+ */
+static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                        struct ib_send_wr **bad_wr)
+{
+       struct rxe_qp *qp = to_rqp(ibqp);
+       unsigned int wr_mask;
+       unsigned int nbytes;
+       int must_sched;
+       int rc = 0;
+       int n;
+
+       if (unlikely(!qp->valid) ||
+           unlikely(qp->req.state < QP_STATE_READY)) {
+               *bad_wr = wr;
+               return -EINVAL;
+       }
+
+       for (; wr; wr = wr->next) {
+               wr_mask = wr_opcode_mask(wr->opcode, qp);
+               if (unlikely(!wr_mask)) {
+                       rc = -EINVAL;
+                       break;
+               }
+
+               if (unlikely((wr->send_flags & IB_SEND_INLINE) &&
+                            !(wr_mask & WR_INLINE_MASK))) {
+                       rc = -EINVAL;
+                       break;
+               }
+
+               /* payload size is the sum of all SGE lengths */
+               nbytes = 0;
+               for (n = 0; n < wr->num_sge; n++)
+                       nbytes += wr->sg_list[n].length;
+
+               rc = post_one_send(qp, wr, wr_mask, nbytes);
+               if (rc)
+                       break;
+       }
+
+       /* the loop only stops early on an error, with wr at the culprit */
+       if (rc)
+               *bad_wr = wr;
+
+       /*
+        * Must sched in case of GSI QP because ib_send_mad() hold irq lock,
+        * and the requester call ip_local_out_sk() that takes spin_lock_bh.
+        */
+       must_sched = (qp_type(qp) == IB_QPT_GSI) ||
+                       (queue_count(qp->sq.queue) > 1);
+
+       rxe_run_task(&qp->req.task, must_sched);
+
+       return rc;
+}
+
+/*
+ * Post a chain of receive work requests to a queue pair's own receive
+ * queue.
+ *
+ * Fails with -EINVAL (and *@bad_wr = @wr) when the QP state is below
+ * IB_QPS_INIT, the QP is not valid, or the QP is attached to an SRQ.
+ * Otherwise the chain is walked under the receive queue's producer_lock;
+ * the first request rejected by post_one_recv() is stored in *@bad_wr and
+ * its error code returned.
+ */
+static int rxe_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                        struct ib_recv_wr **bad_wr)
+{
+       struct rxe_qp *qp = to_rqp(ibqp);
+       struct rxe_rq *rq = &qp->rq;
+       unsigned long irq_flags;
+       int rc = 0;
+
+       if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid) ||
+           unlikely(qp->srq)) {
+               *bad_wr = wr;
+               return -EINVAL;
+       }
+
+       spin_lock_irqsave(&rq->producer_lock, irq_flags);
+
+       for (; wr; wr = wr->next) {
+               rc = post_one_recv(rq, wr);
+               if (unlikely(rc)) {
+                       *bad_wr = wr;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&rq->producer_lock, irq_flags);
+
+       return rc;
+}
+
+/*
+ * Create a completion queue on @dev.
+ *
+ * Any creation flag is rejected with -EINVAL.  The requested size and
+ * completion vector are checked with rxe_cq_chk_attr(), an rxe_cq is taken
+ * from the device's cq_pool and set up by rxe_cq_from_init(); if that setup
+ * fails the CQ reference is dropped again.
+ *
+ * Returns the embedded ib_cq on success or an ERR_PTR().
+ */
+static struct ib_cq *rxe_create_cq(struct ib_device *dev,
+                                  const struct ib_cq_init_attr *attr,
+                                  struct ib_ucontext *context,
+                                  struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+       struct rxe_cq *cq;
+       int rc;
+
+       if (attr->flags)
+               return ERR_PTR(-EINVAL);
+
+       rc = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector, udata);
+       if (rc)
+               return ERR_PTR(rc);
+
+       cq = rxe_alloc(&rxe->cq_pool);
+       if (!cq)
+               return ERR_PTR(-ENOMEM);
+
+       rc = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector,
+                             context, udata);
+       if (rc) {
+               rxe_drop_ref(cq);
+               return ERR_PTR(rc);
+       }
+
+       return &cq->ibcq;
+}
+
+/*
+ * Destroy a completion queue by dropping the reference on it.
+ * Always returns 0.
+ */
+static int rxe_destroy_cq(struct ib_cq *ibcq)
+{
+       struct rxe_cq *victim = to_rcq(ibcq);
+
+       rxe_drop_ref(victim);
+
+       return 0;
+}
+
+/*
+ * Resize a completion queue to @cqe entries.  The new size is checked with
+ * rxe_cq_chk_attr() (completion vector 0) and, if acceptable, applied by
+ * rxe_cq_resize_queue().  Returns 0 on success or the error of the step
+ * that failed.
+ */
+static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(ibcq->device);
+       struct rxe_cq *cq = to_rcq(ibcq);
+       int rc;
+
+       rc = rxe_cq_chk_attr(rxe, cq, cqe, 0, udata);
+       if (!rc)
+               rc = rxe_cq_resize_queue(cq, cqe, udata);
+
+       return rc;
+}
+
+/*
+ * Remove up to @num_entries completions from the CQ and copy them into the
+ * array @wc.  The queue is drained under cq_lock; polling stops early when
+ * the queue has no further entry.  Returns the number of completions
+ * copied.
+ */
+static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+       struct rxe_cq *cq = to_rcq(ibcq);
+       struct rxe_cqe *entry;
+       unsigned long irq_flags;
+       int npolled = 0;
+
+       spin_lock_irqsave(&cq->cq_lock, irq_flags);
+       while (npolled < num_entries) {
+               entry = queue_head(cq->queue);
+               if (!entry)
+                       break;
+
+               memcpy(&wc[npolled], &entry->ibwc, sizeof(wc[npolled]));
+               advance_consumer(cq->queue);
+               npolled++;
+       }
+       spin_unlock_irqrestore(&cq->cq_lock, irq_flags);
+
+       return npolled;
+}
+
+/*
+ * Report how many completions are queued on the CQ, capped at @wc_cnt.
+ */
+static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt)
+{
+       struct rxe_cq *cq = to_rcq(ibcq);
+       int queued = queue_count(cq->queue);
+
+       if (queued > wc_cnt)
+               return wc_cnt;
+
+       return queued;
+}
+
+/*
+ * Arm the CQ for a completion notification.
+ *
+ * A CQ whose notify setting is already IB_CQ_NEXT_COMP is left as it is;
+ * otherwise the setting becomes the IB_CQ_SOLICITED_MASK bits of @flags.
+ * Always returns 0.
+ */
+static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+       struct rxe_cq *cq = to_rcq(ibcq);
+
+       if (cq->notify == IB_CQ_NEXT_COMP)
+               return 0;
+
+       cq->notify = flags & IB_CQ_SOLICITED_MASK;
+
+       return 0;
+}
+
+/*
+ * Create a DMA memory region in protection domain @ibpd with the given
+ * @access flags.
+ *
+ * An rxe_mem is taken from the device's mr_pool, given an index, a
+ * reference is added on the PD and rxe_mem_init_dma() initialises the
+ * region.  If that initialisation fails the PD reference, the index and the
+ * MR reference are released.
+ *
+ * Returns the embedded ib_mr on success or an ERR_PTR().
+ */
+static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_mem *mr;
+       int rc;
+
+       mr = rxe_alloc(&rxe->mr_pool);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(mr);
+       rxe_add_ref(pd);
+
+       rc = rxe_mem_init_dma(rxe, pd, access, mr);
+       if (rc) {
+               rxe_drop_ref(pd);
+               rxe_drop_index(mr);
+               rxe_drop_ref(mr);
+               return ERR_PTR(rc);
+       }
+
+       return &mr->ibmr;
+}
+
+/*
+ * Register a user memory region in protection domain @ibpd.
+ *
+ * An rxe_mem is taken from the device's mr_pool, given an index, a
+ * reference is added on the PD and rxe_mem_init_user() is called with
+ * @start, @length, @iova, @access and @udata.  If that call fails the PD
+ * reference, the index and the MR reference are released.
+ *
+ * Returns the embedded ib_mr on success or an ERR_PTR().
+ */
+static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
+                                    u64 start,
+                                    u64 length,
+                                    u64 iova,
+                                    int access, struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_mem *mr;
+       int rc;
+
+       mr = rxe_alloc(&rxe->mr_pool);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(mr);
+       rxe_add_ref(pd);
+
+       rc = rxe_mem_init_user(rxe, pd, start, length, iova,
+                              access, udata, mr);
+       if (rc) {
+               rxe_drop_ref(pd);
+               rxe_drop_index(mr);
+               rxe_drop_ref(mr);
+               return ERR_PTR(rc);
+       }
+
+       return &mr->ibmr;
+}
+
+/*
+ * Deregister a memory region: mark it RXE_MEM_STATE_ZOMBIE, then drop the
+ * PD reference, the index and the MR reference.  Always returns 0.
+ */
+static int rxe_dereg_mr(struct ib_mr *ibmr)
+{
+       struct rxe_mem *mem = to_rmr(ibmr);
+
+       mem->state = RXE_MEM_STATE_ZOMBIE;
+
+       rxe_drop_ref(mem->pd);
+       rxe_drop_index(mem);
+       rxe_drop_ref(mem);
+
+       return 0;
+}
+
+/*
+ * Allocate a memory region for fast registration in protection domain
+ * @ibpd.
+ *
+ * Only IB_MR_TYPE_MEM_REG is accepted; any other @mr_type yields
+ * ERR_PTR(-EINVAL).  An rxe_mem is taken from the device's mr_pool, given an
+ * index, a reference is added on the PD and rxe_mem_init_fast() is called
+ * with @max_num_sg.  If that call fails the PD reference, the index and the
+ * MR reference are released.
+ *
+ * Returns the embedded ib_mr on success or an ERR_PTR().
+ */
+static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd,
+                                 enum ib_mr_type mr_type,
+                                 u32 max_num_sg)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_mem *mr;
+       int rc;
+
+       if (mr_type != IB_MR_TYPE_MEM_REG)
+               return ERR_PTR(-EINVAL);
+
+       mr = rxe_alloc(&rxe->mr_pool);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(mr);
+       rxe_add_ref(pd);
+
+       rc = rxe_mem_init_fast(rxe, pd, max_num_sg, mr);
+       if (rc) {
+               rxe_drop_ref(pd);
+               rxe_drop_index(mr);
+               rxe_drop_ref(mr);
+               return ERR_PTR(rc);
+       }
+
+       return &mr->ibmr;
+}
+
+/*
+ * Record one page address in the memory region; passed as the per-page
+ * callback to ib_sg_to_pages() by rxe_map_mr_sg().
+ *
+ * The next free buffer slot is located via mr->nbuf (RXE_BUF_PER_MAP slots
+ * per map) and filled with @addr and the MR's page size.  Returns -ENOMEM
+ * once mr->nbuf has reached mr->num_buf, otherwise 0.
+ */
+static int rxe_set_page(struct ib_mr *ibmr, u64 addr)
+{
+       struct rxe_mem *mr = to_rmr(ibmr);
+       struct rxe_phys_buf *slot;
+
+       if (unlikely(mr->nbuf == mr->num_buf))
+               return -ENOMEM;
+
+       slot = &mr->map[mr->nbuf / RXE_BUF_PER_MAP]->buf[mr->nbuf %
+                                                        RXE_BUF_PER_MAP];
+       slot->addr = addr;
+       slot->size = ibmr->page_size;
+
+       mr->nbuf++;
+
+       return 0;
+}
+
+/*
+ * Map a scatterlist onto a memory region.
+ *
+ * The buffer count is reset and ib_sg_to_pages() is run with rxe_set_page()
+ * as the per-page callback.  Afterwards the region's va, iova, length and
+ * page geometry are taken from the ib_mr.  Returns whatever
+ * ib_sg_to_pages() returned.
+ */
+static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                        unsigned int *sg_offset)
+{
+       struct rxe_mem *mr = to_rmr(ibmr);
+       int nmapped;
+
+       mr->nbuf = 0;
+
+       nmapped = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page);
+
+       mr->page_shift = ilog2(ibmr->page_size);
+       mr->page_mask = ibmr->page_size - 1;
+       mr->length = ibmr->length;
+       mr->va = ibmr->iova;
+       mr->iova = ibmr->iova;
+       /* offset of the region start within its first page */
+       mr->offset = mr->iova & mr->page_mask;
+
+       return nmapped;
+}
+
+/*
+ * Attach a queue pair to the multicast group identified by @mgid.
+ *
+ * The group is obtained with rxe_mcast_get_grp() and the QP added to it
+ * with rxe_mcast_add_grp_elem(); the group reference taken by the lookup is
+ * dropped again before returning.  @mlid is not used.  Returns 0 on success
+ * or the error of the step that failed.
+ */
+static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+       struct rxe_dev *rxe = to_rdev(ibqp->device);
+       struct rxe_qp *qp = to_rqp(ibqp);
+       struct rxe_mc_grp *grp;
+       int rc;
+
+       /* takes a ref on grp if successful */
+       rc = rxe_mcast_get_grp(rxe, mgid, &grp);
+       if (!rc) {
+               rc = rxe_mcast_add_grp_elem(rxe, qp, grp);
+               rxe_drop_ref(grp);
+       }
+
+       return rc;
+}
+
+/*
+ * Detach a queue pair from the multicast group identified by @mgid by
+ * calling rxe_mcast_drop_grp_elem(), whose result is returned.  @mlid is
+ * not used.
+ */
+static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+       return rxe_mcast_drop_grp_elem(to_rdev(ibqp->device), to_rqp(ibqp),
+                                      mgid);
+}
+
+/*
+ * sysfs show() handler for the "parent" device attribute.
+ *
+ * Writes the string returned by the interface's parent_name() op, followed
+ * by a newline, into @buf and returns the number of bytes written.
+ *
+ * scnprintf() is used because it returns the number of characters actually
+ * stored; snprintf() returns the length the output would have had, which
+ * can exceed what was written when the output is truncated.  The bound is
+ * PAGE_SIZE, the size of the buffer sysfs hands to show() callbacks, so the
+ * trailing newline is never cut off.
+ */
+static ssize_t rxe_show_parent(struct device *device,
+                              struct device_attribute *attr, char *buf)
+{
+       struct rxe_dev *rxe = container_of(device, struct rxe_dev,
+                                          ib_dev.dev);
+       char *name;
+
+       name = rxe->ifc_ops->parent_name(rxe, 1);
+       return scnprintf(buf, PAGE_SIZE, "%s\n", name);
+}
+
+/* read-only "parent" attribute (dev_attr_parent), shown by rxe_show_parent() */
+static DEVICE_ATTR(parent, S_IRUGO, rxe_show_parent, NULL);
+
+/*
+ * Device attributes created by rxe_register_device() and removed again by
+ * rxe_unregister_device().
+ */
+static struct device_attribute *rxe_dev_attributes[] = {
+       &dev_attr_parent,
+};
+
+/*
+ * Fill in the ib_device embedded in @rxe and register it with the IB core.
+ *
+ * Sets the device identity fields, the mask of supported uverbs commands
+ * and the verbs entry points, calls ib_register_device() and then creates
+ * the sysfs files listed in rxe_dev_attributes.
+ *
+ * Returns 0 on success or a non-zero error code.  If creating a sysfs file
+ * fails, the device is unregistered again before returning.
+ */
+int rxe_register_device(struct rxe_dev *rxe)
+{
+       int err;
+       int i;
+       struct ib_device *dev = &rxe->ib_dev;
+
+       /* NOTE(review): "rxe%d" looks like a name template that the IB core
+        * expands at registration time - confirm
+        */
+       strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX);
+       strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
+
+       dev->owner = THIS_MODULE;
+       dev->node_type = RDMA_NODE_IB_CA;
+       dev->phys_port_cnt = 1;
+       dev->num_comp_vectors = RXE_NUM_COMP_VECTORS;
+       /* DMA device and node GUID come from the interface ops */
+       dev->dma_device = rxe->ifc_ops->dma_device(rxe);
+       dev->local_dma_lkey = 0;
+       dev->node_guid = rxe->ifc_ops->node_guid(rxe);
+       dev->dma_ops = &rxe_dma_mapping_ops;
+
+       /* user verbs commands this device advertises */
+       dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
+       dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT)
+           | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD)
+           | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_POST_SRQ_RECV)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND)
+           | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_PEEK_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_REG_MR)
+           | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST)
+           | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST)
+           ;
+
+       /* verbs entry points implemented by this driver */
+       dev->query_device = rxe_query_device;
+       dev->modify_device = rxe_modify_device;
+       dev->query_port = rxe_query_port;
+       dev->modify_port = rxe_modify_port;
+       dev->get_link_layer = rxe_get_link_layer;
+       dev->query_gid = rxe_query_gid;
+       dev->get_netdev = rxe_get_netdev;
+       dev->add_gid = rxe_add_gid;
+       dev->del_gid = rxe_del_gid;
+       dev->query_pkey = rxe_query_pkey;
+       dev->alloc_ucontext = rxe_alloc_ucontext;
+       dev->dealloc_ucontext = rxe_dealloc_ucontext;
+       dev->mmap = rxe_mmap;
+       dev->get_port_immutable = rxe_port_immutable;
+       dev->alloc_pd = rxe_alloc_pd;
+       dev->dealloc_pd = rxe_dealloc_pd;
+       dev->create_ah = rxe_create_ah;
+       dev->modify_ah = rxe_modify_ah;
+       dev->query_ah = rxe_query_ah;
+       dev->destroy_ah = rxe_destroy_ah;
+       dev->create_srq = rxe_create_srq;
+       dev->modify_srq = rxe_modify_srq;
+       dev->query_srq = rxe_query_srq;
+       dev->destroy_srq = rxe_destroy_srq;
+       dev->post_srq_recv = rxe_post_srq_recv;
+       dev->create_qp = rxe_create_qp;
+       dev->modify_qp = rxe_modify_qp;
+       dev->query_qp = rxe_query_qp;
+       dev->destroy_qp = rxe_destroy_qp;
+       dev->post_send = rxe_post_send;
+       dev->post_recv = rxe_post_recv;
+       dev->create_cq = rxe_create_cq;
+       dev->destroy_cq = rxe_destroy_cq;
+       dev->resize_cq = rxe_resize_cq;
+       dev->poll_cq = rxe_poll_cq;
+       dev->peek_cq = rxe_peek_cq;
+       dev->req_notify_cq = rxe_req_notify_cq;
+       dev->get_dma_mr = rxe_get_dma_mr;
+       dev->reg_user_mr = rxe_reg_user_mr;
+       dev->dereg_mr = rxe_dereg_mr;
+       dev->alloc_mr = rxe_alloc_mr;
+       dev->map_mr_sg = rxe_map_mr_sg;
+       dev->attach_mcast = rxe_attach_mcast;
+       dev->detach_mcast = rxe_detach_mcast;
+
+       err = ib_register_device(dev, NULL);
+       if (err) {
+               pr_warn("rxe_register_device failed, err = %d\n", err);
+               goto err1;
+       }
+
+       /* expose the driver's sysfs attributes on the registered device */
+       for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) {
+               err = device_create_file(&dev->dev, rxe_dev_attributes[i]);
+               if (err) {
+                       pr_warn("device_create_file failed, i = %d, err = %d\n",
+                               i, err);
+                       goto err2;
+               }
+       }
+
+       return 0;
+
+err2:
+       /* files created so far are not removed explicitly here */
+       ib_unregister_device(dev);
+err1:
+       return err;
+}
+
+/*
+ * Undo rxe_register_device(): remove every sysfs file listed in
+ * rxe_dev_attributes, then unregister the ib_device.  Always returns 0.
+ */
+int rxe_unregister_device(struct rxe_dev *rxe)
+{
+       struct ib_device *ibdev = &rxe->ib_dev;
+       int idx = 0;
+
+       while (idx < ARRAY_SIZE(rxe_dev_attributes)) {
+               device_remove_file(&ibdev->dev, rxe_dev_attributes[idx]);
+               ++idx;
+       }
+
+       ib_unregister_device(ibdev);
+
+       return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
new file mode 100644 (file)
index 0000000..cac1d52
--- /dev/null
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_VERBS_H
+#define RXE_VERBS_H
+
+#include <linux/interrupt.h>
+#include <rdma/rdma_user_rxe.h>
+#include "rxe_pool.h"
+#include "rxe_task.h"
+
+static inline int pkey_match(u16 key1, u16 key2)
+{
+       /* equal non-zero low 15 bits, and bit 15 set in at least one key */
+       return (key1 & 0x7fff) && !((key1 ^ key2) & 0x7fff) &&
+              ((key1 | key2) & 0x8000);
+}
+
+/* Order two PSNs.  The difference is shifted up by 8 bits before being
+ * read as signed, so only its low 24 bits decide the result:
+ *       >0 if psn_a > psn_b
+ *        0 if psn_a == psn_b
+ *       <0 if psn_a < psn_b
+ */
+static inline int psn_compare(u32 psn_a, u32 psn_b)
+{
+       return (s32)((psn_a - psn_b) << 8);
+}
+
+/* rxe wrapper around the core ib_ucontext.  to_ruc() recovers this struct
+ * from a pointer to the embedded ibuc member.
+ */
+struct rxe_ucontext {
+       struct rxe_pool_entry   pelem;
+       struct ib_ucontext      ibuc;
+};
+
+/* rxe wrapper around the core ib_pd.  to_rpd() recovers this struct from
+ * a pointer to the embedded ibpd member.
+ */
+struct rxe_pd {
+       struct rxe_pool_entry   pelem;
+       struct ib_pd            ibpd;
+};
+
+/* rxe wrapper around the core ib_ah, plus the owning PD pointer and an
+ * rxe_av.  to_rah() recovers this struct from the embedded ibah member.
+ */
+struct rxe_ah {
+       struct rxe_pool_entry   pelem;
+       struct ib_ah            ibah;
+       struct rxe_pd           *pd;
+       struct rxe_av           av;
+};
+
+/* One completion entry, stored either as a kernel ib_wc or as a
+ * ib_uverbs_wc; both views share the same storage.
+ * NOTE(review): presumably rxe_cq.is_user selects which view is used —
+ * confirm against the CQ code.
+ */
+struct rxe_cqe {
+       union {
+               struct ib_wc            ibwc;
+               struct ib_uverbs_wc     uibwc;
+       };
+};
+
+/* rxe completion queue object.  to_rcq() recovers this struct from a
+ * pointer to the embedded ibcq member.
+ */
+struct rxe_cq {
+       struct rxe_pool_entry   pelem;
+       struct ib_cq            ibcq;
+       struct rxe_queue        *queue;
+       spinlock_t              cq_lock;
+       u8                      notify;
+       int                     is_user;
+       struct tasklet_struct   comp_task;
+};
+
+/* Processing states of a work queue element.
+ * NOTE(review): the field that stores this is not declared in this
+ * header — confirm where it lives.
+ */
+enum wqe_state {
+       wqe_state_posted,
+       wqe_state_processing,
+       wqe_state_pending,
+       wqe_state_done,
+       wqe_state_error,
+};
+
+/* Send queue limits, lock and backing queue; embedded as rxe_qp.sq. */
+struct rxe_sq {
+       int                     max_wr;
+       int                     max_sge;
+       int                     max_inline;
+       spinlock_t              sq_lock; /* guard queue */
+       struct rxe_queue        *queue;
+};
+
+/* Receive queue limits and backing queue, with separate locks for the
+ * producer and consumer sides; embedded as rxe_qp.rq and rxe_srq.rq.
+ */
+struct rxe_rq {
+       int                     max_wr;
+       int                     max_sge;
+       spinlock_t              producer_lock; /* guard queue producer */
+       spinlock_t              consumer_lock; /* guard queue consumer */
+       struct rxe_queue        *queue;
+};
+
+/* rxe shared receive queue object.  to_rsrq() recovers this struct from
+ * a pointer to the embedded ibsrq member.
+ */
+struct rxe_srq {
+       struct rxe_pool_entry   pelem;
+       struct ib_srq           ibsrq;
+       struct rxe_pd           *pd;
+       struct rxe_rq           rq;
+       u32                     srq_num;
+
+       int                     limit;
+       int                     error;
+};
+
+/* State value kept in both rxe_req_info.state and rxe_resp_info.state.
+ * The two DRAIN states are marked as applying to the req side only.
+ */
+enum rxe_qp_state {
+       QP_STATE_RESET,
+       QP_STATE_INIT,
+       QP_STATE_READY,
+       QP_STATE_DRAIN,         /* req only */
+       QP_STATE_DRAINED,       /* req only */
+       QP_STATE_ERROR
+};
+
+/* Defined outside this header; presumably indexed by enum rxe_qp_state —
+ * confirm at the definition.
+ */
+extern char *rxe_qp_state_name[];
+
+/* Per-QP state embedded as rxe_qp.req, together with the task that
+ * works on it.
+ * NOTE(review): presumably the requester side of the QP — confirm.
+ */
+struct rxe_req_info {
+       enum rxe_qp_state       state;
+       int                     wqe_index;
+       u32                     psn;
+       int                     opcode;
+       atomic_t                rd_atomic;
+       int                     wait_fence;
+       int                     need_rd_atomic;
+       int                     wait_psn;
+       int                     need_retry;
+       int                     noack_pkts;
+       struct rxe_task         task;
+};
+
+/* Per-QP state embedded as rxe_qp.comp, together with the task that
+ * works on it.
+ * NOTE(review): presumably the completer side of the QP — confirm.
+ */
+struct rxe_comp_info {
+       u32                     psn;
+       int                     opcode;
+       int                     timeout;
+       int                     timeout_retry;
+       u32                     retry_cnt;
+       u32                     rnr_retry;
+       struct rxe_task         task;
+};
+
+/* State value kept in resp_res.state. */
+enum rdatm_res_state {
+       rdatm_res_state_next,
+       rdatm_res_state_new,
+       rdatm_res_state_replay,
+};
+
+/* One responder resource; rxe_resp_info.resources points at these.
+ * The payload is a union: an skb for the atomic case, or MR/address
+ * bookkeeping for the read case.
+ * NOTE(review): presumably 'type' tells which union member is live —
+ * confirm against the responder code.
+ */
+struct resp_res {
+       int                     type;
+       u32                     first_psn;
+       u32                     last_psn;
+       u32                     cur_psn;
+       enum rdatm_res_state    state;
+
+       union {
+               struct {
+                       struct sk_buff  *skb;
+               } atomic;
+               struct {
+                       struct rxe_mem  *mr;
+                       u64             va_org;
+                       u32             rkey;
+                       u32             length;
+                       u64             va;
+                       u32             resid;
+               } read;
+       };
+};
+
+/* Per-QP responder state embedded as rxe_qp.resp, together with the task
+ * that works on it.  The fields are grouped by the kind of request they
+ * serve, as marked below.
+ */
+struct rxe_resp_info {
+       enum rxe_qp_state       state;
+       u32                     msn;
+       u32                     psn;
+       int                     opcode;
+       int                     drop_msg;
+       int                     goto_error;
+       int                     sent_psn_nak;
+       enum ib_wc_status       status;
+       u8                      aeth_syndrome;
+
+       /* Receive only */
+       struct rxe_recv_wqe     *wqe;
+
+       /* RDMA read / atomic only */
+       u64                     va;
+       struct rxe_mem          *mr;
+       u32                     resid;
+       u32                     rkey;
+       u64                     atomic_orig;
+
+       /* SRQ only */
+       struct {
+               struct rxe_recv_wqe     wqe;
+               struct ib_sge           sge[RXE_MAX_SGE];
+       } srq_wqe;
+
+       /* Responder resources. It's a circular list where the oldest
+        * resource is dropped first.
+        */
+       struct resp_res         *resources;
+       unsigned int            res_head;
+       unsigned int            res_tail;
+       struct resp_res         *res;
+       struct rxe_task         task;
+};
+
+/* rxe queue pair object.  to_rqp() recovers this struct from a pointer to
+ * the embedded ibqp member.  It aggregates the send/receive queues, the
+ * req/comp/resp state blocks, three skb queues and two timers.
+ */
+struct rxe_qp {
+       struct rxe_pool_entry   pelem;
+       struct ib_qp            ibqp;
+       struct ib_qp_attr       attr;
+       unsigned int            valid;
+       unsigned int            mtu;
+       int                     is_user;
+
+       struct rxe_pd           *pd;
+       struct rxe_srq          *srq;
+       struct rxe_cq           *scq;
+       struct rxe_cq           *rcq;
+
+       enum ib_sig_type        sq_sig_type;
+
+       struct rxe_sq           sq;
+       struct rxe_rq           rq;
+
+       struct socket           *sk;
+
+       struct rxe_av           pri_av;
+       struct rxe_av           alt_av;
+
+       /* list of mcast groups qp has joined (for cleanup) */
+       struct list_head        grp_list;
+       spinlock_t              grp_lock; /* guard grp_list */
+
+       struct sk_buff_head     req_pkts;
+       struct sk_buff_head     resp_pkts;
+       struct sk_buff_head     send_pkts;
+
+       struct rxe_req_info     req;
+       struct rxe_comp_info    comp;
+       struct rxe_resp_info    resp;
+
+       atomic_t                ssn;
+       atomic_t                skb_out;
+       int                     need_req_skb;
+
+       /* Timer for retransmitting packet when ACKs have been lost. RC
+        * only. The requester sets it when it is not already
+        * started. The responder resets it whenever an ack is
+        * received.
+        */
+       struct timer_list retrans_timer;
+       u64 qp_timeout_jiffies;
+
+       /* Timer for handling RNR NAKS. */
+       struct timer_list rnr_nak_timer;
+
+       spinlock_t              state_lock; /* guard requester and completer */
+};
+
+/* State value kept in rxe_mem.state. */
+enum rxe_mem_state {
+       RXE_MEM_STATE_ZOMBIE,
+       RXE_MEM_STATE_INVALID,
+       RXE_MEM_STATE_FREE,
+       RXE_MEM_STATE_VALID,
+};
+
+/* Kind of memory object, kept in rxe_mem.type. */
+enum rxe_mem_type {
+       RXE_MEM_TYPE_NONE,
+       RXE_MEM_TYPE_DMA,
+       RXE_MEM_TYPE_MR,
+       RXE_MEM_TYPE_FMR,
+       RXE_MEM_TYPE_MW,
+};
+
+/* Number of rxe_phys_buf entries that fit in one page; this sizes
+ * rxe_map so that a single map never exceeds PAGE_SIZE.
+ */
+#define RXE_BUF_PER_MAP                (PAGE_SIZE / sizeof(struct rxe_phys_buf))
+
+/* One buffer described by an address and a size. */
+struct rxe_phys_buf {
+       u64      addr;
+       u64      size;
+};
+
+/* A page-sized table of buffers; rxe_mem.map is an array of pointers to
+ * these.
+ */
+struct rxe_map {
+       struct rxe_phys_buf     buf[RXE_BUF_PER_MAP];
+};
+
+/* rxe memory object.  The embedded core object is either an ib_mr or an
+ * ib_mw sharing the same storage; to_rmr() and to_rmw() both recover this
+ * struct from the respective member.  'type' records the kind of object
+ * and 'state' its current state.
+ */
+struct rxe_mem {
+       struct rxe_pool_entry   pelem;
+       union {
+               struct ib_mr            ibmr;
+               struct ib_mw            ibmw;
+       };
+
+       struct rxe_pd           *pd;
+       struct ib_umem          *umem;
+
+       u32                     lkey;
+       u32                     rkey;
+
+       enum rxe_mem_state      state;
+       enum rxe_mem_type       type;
+       u64                     va;
+       u64                     iova;
+       size_t                  length;
+       u32                     offset;
+       int                     access;
+
+       int                     page_shift;
+       int                     page_mask;
+       int                     map_shift;
+       int                     map_mask;
+
+       u32                     num_buf;
+       u32                     nbuf;
+
+       u32                     max_buf;
+       u32                     num_map;
+
+       struct rxe_map          **map;
+};
+
+/* A multicast group identified by mgid, with a list of attached QPs and
+ * a count of them.
+ * NOTE(review): qp_list presumably chains rxe_mc_elem.qp_list entries —
+ * confirm against the mcast code.
+ */
+struct rxe_mc_grp {
+       struct rxe_pool_entry   pelem;
+       spinlock_t              mcg_lock; /* guard group */
+       struct rxe_dev          *rxe;
+       struct list_head        qp_list;
+       union ib_gid            mgid;
+       int                     num_qp;
+       u32                     qkey;
+       u16                     pkey;
+};
+
+/* Links one QP to one multicast group: it holds a pointer to each and
+ * two list nodes.
+ * NOTE(review): presumably qp_list hangs off rxe_mc_grp.qp_list and
+ * grp_list off rxe_qp.grp_list — confirm against the mcast code.
+ */
+struct rxe_mc_elem {
+       struct rxe_pool_entry   pelem;
+       struct list_head        qp_list;
+       struct list_head        grp_list;
+       struct rxe_qp           *qp;
+       struct rxe_mc_grp       *grp;
+};
+
+/* Port state, embedded as rxe_dev.port: core port attributes, the pkey
+ * table, GUID/prefix, MTU cap and the indices of the two special QPs.
+ */
+struct rxe_port {
+       struct ib_port_attr     attr;
+       u16                     *pkey_tbl;
+       __be64                  port_guid;
+       __be64                  subnet_prefix;
+       spinlock_t              port_lock; /* guard port */
+       unsigned int            mtu_cap;
+       /* special QPs */
+       u32                     qp_smi_index;
+       u32                     qp_gsi_index;
+};
+
+/* callbacks from rdma_rxe to network interface layer; a device reaches
+ * its table through rxe_dev.ifc_ops.  Every callback except loopback()
+ * takes the rxe_dev it acts on as first argument.
+ */
+struct rxe_ifc_ops {
+       void (*release)(struct rxe_dev *rxe);
+       __be64 (*node_guid)(struct rxe_dev *rxe);
+       __be64 (*port_guid)(struct rxe_dev *rxe);
+       struct device *(*dma_device)(struct rxe_dev *rxe);
+       int (*mcast_add)(struct rxe_dev *rxe, union ib_gid *mgid);
+       int (*mcast_delete)(struct rxe_dev *rxe, union ib_gid *mgid);
+       int (*prepare)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                      struct sk_buff *skb, u32 *crc);
+       int (*send)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                   struct sk_buff *skb);
+       int (*loopback)(struct sk_buff *skb);
+       struct sk_buff *(*init_packet)(struct rxe_dev *rxe, struct rxe_av *av,
+                                      int paylen, struct rxe_pkt_info *pkt);
+       char *(*parent_name)(struct rxe_dev *rxe, unsigned int port_num);
+       enum rdma_link_layer (*link_layer)(struct rxe_dev *rxe,
+                                          unsigned int port_num);
+};
+
+/* Top-level rxe device.  to_rdev() recovers this struct from a pointer to
+ * the embedded ib_dev member.  It carries the interface-layer callback
+ * table, the backing net_device, one rxe_pool per object kind, the mmap
+ * bookkeeping and the single port.
+ */
+struct rxe_dev {
+       struct ib_device        ib_dev;
+       struct ib_device_attr   attr;
+       int                     max_ucontext;
+       int                     max_inline_data;
+       struct kref             ref_cnt;
+       struct mutex    usdev_lock;
+
+       struct rxe_ifc_ops      *ifc_ops;
+
+       struct net_device       *ndev;
+
+       int                     xmit_errors;
+
+       struct rxe_pool         uc_pool;
+       struct rxe_pool         pd_pool;
+       struct rxe_pool         ah_pool;
+       struct rxe_pool         srq_pool;
+       struct rxe_pool         qp_pool;
+       struct rxe_pool         cq_pool;
+       struct rxe_pool         mr_pool;
+       struct rxe_pool         mw_pool;
+       struct rxe_pool         mc_grp_pool;
+       struct rxe_pool         mc_elem_pool;
+
+       spinlock_t              pending_lock; /* guard pending_mmaps */
+       struct list_head        pending_mmaps;
+
+       spinlock_t              mmap_offset_lock; /* guard mmap_offset */
+       int                     mmap_offset;
+
+       struct rxe_port         port;
+       struct list_head        list;
+};
+
+/* Map a core ib_device to the rxe_dev embedding it; NULL stays NULL. */
+static inline struct rxe_dev *to_rdev(struct ib_device *dev)
+{
+       if (!dev)
+               return NULL;
+
+       return container_of(dev, struct rxe_dev, ib_dev);
+}
+
+/* Map a core ib_ucontext to the rxe_ucontext embedding it; NULL stays NULL. */
+static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc)
+{
+       if (!uc)
+               return NULL;
+
+       return container_of(uc, struct rxe_ucontext, ibuc);
+}
+
+/* Map a core ib_pd to the rxe_pd embedding it; NULL stays NULL. */
+static inline struct rxe_pd *to_rpd(struct ib_pd *pd)
+{
+       if (!pd)
+               return NULL;
+
+       return container_of(pd, struct rxe_pd, ibpd);
+}
+
+/* Map a core ib_ah to the rxe_ah embedding it; NULL stays NULL. */
+static inline struct rxe_ah *to_rah(struct ib_ah *ah)
+{
+       if (!ah)
+               return NULL;
+
+       return container_of(ah, struct rxe_ah, ibah);
+}
+
+/* Map a core ib_srq to the rxe_srq embedding it; NULL stays NULL. */
+static inline struct rxe_srq *to_rsrq(struct ib_srq *srq)
+{
+       if (!srq)
+               return NULL;
+
+       return container_of(srq, struct rxe_srq, ibsrq);
+}
+
+/* Map a core ib_qp to the rxe_qp embedding it; NULL stays NULL. */
+static inline struct rxe_qp *to_rqp(struct ib_qp *qp)
+{
+       if (!qp)
+               return NULL;
+
+       return container_of(qp, struct rxe_qp, ibqp);
+}
+
+/* Map a core ib_cq to the rxe_cq embedding it; NULL stays NULL. */
+static inline struct rxe_cq *to_rcq(struct ib_cq *cq)
+{
+       if (!cq)
+               return NULL;
+
+       return container_of(cq, struct rxe_cq, ibcq);
+}
+
+/* Map a core ib_mr to the rxe_mem embedding it; NULL stays NULL. */
+static inline struct rxe_mem *to_rmr(struct ib_mr *mr)
+{
+       if (!mr)
+               return NULL;
+
+       return container_of(mr, struct rxe_mem, ibmr);
+}
+
+/* Map a core ib_mw to the rxe_mem embedding it; NULL stays NULL. */
+static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
+{
+       if (!mw)
+               return NULL;
+
+       return container_of(mw, struct rxe_mem, ibmw);
+}
+
+int rxe_register_device(struct rxe_dev *rxe);
+int rxe_unregister_device(struct rxe_dev *rxe);
+
+void rxe_mc_cleanup(void *arg);
+
+#endif /* RXE_VERBS_H */
index 1502199..7b6d40f 100644 (file)
@@ -62,10 +62,8 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
 {
        struct ipoib_dev_priv *priv = netdev_priv(netdev);
 
-       snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
-                "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32),
-                (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff,
-                (int)priv->ca->attrs.fw_ver & 0xffff);
+       ib_get_device_fw_str(priv->ca, drvinfo->fw_version,
+                            sizeof(drvinfo->fw_version));
 
        strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device),
                sizeof(drvinfo->bus_info));
index 5f58c41..74bcaa0 100644 (file)
@@ -1967,8 +1967,7 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
        priv->hca_caps = hca->attrs.device_cap_flags;
 
        if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
-               priv->dev->hw_features = NETIF_F_SG |
-                       NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
+               priv->dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
 
                if (priv->hca_caps & IB_DEVICE_UD_TSO)
                        priv->dev->hw_features |= NETIF_F_TSO;
index 1e7cbba..c55ecb2 100644 (file)
@@ -135,7 +135,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
                .cap = {
                        .max_send_wr  = ipoib_sendq_size,
                        .max_recv_wr  = ipoib_recvq_size,
-                       .max_send_sge = 1,
+                       .max_send_sge = min_t(u32, priv->ca->attrs.max_sge,
+                                             MAX_SKB_FRAGS + 1),
                        .max_recv_sge = IPOIB_UD_RX_SG
                },
                .sq_sig_type = IB_SIGNAL_ALL_WR,
@@ -205,10 +206,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
        if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING)
                init_attr.create_flags |= IB_QP_CREATE_NETIF_QP;
 
-       if (dev->features & NETIF_F_SG)
-               init_attr.cap.max_send_sge =
-                       min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1);
-
        priv->qp = ib_create_qp(priv->pd, &init_attr);
        if (IS_ERR(priv->qp)) {
                printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
@@ -234,6 +231,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
        priv->rx_wr.next = NULL;
        priv->rx_wr.sg_list = priv->rx_sge;
 
+       if (init_attr.cap.max_send_sge > 1)
+               dev->features |= NETIF_F_SG;
+
        priv->max_send_sge = init_attr.cap.max_send_sge;
 
        return 0;
index 4705e2d..e0ebe13 100644 (file)
@@ -104,6 +104,8 @@ enum {
 
 enum CPL_error {
        CPL_ERR_NONE               = 0,
+       CPL_ERR_TCAM_PARITY        = 1,
+       CPL_ERR_TCAM_MISS          = 2,
        CPL_ERR_TCAM_FULL          = 3,
        CPL_ERR_BAD_LENGTH         = 15,
        CPL_ERR_BAD_ROUTE          = 18,
index f4497cf..d728704 100644 (file)
@@ -721,6 +721,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET         0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET                0xa0
 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET                0x9c
+#define QUERY_DEV_CAP_DIAG_RPRT_PER_PORT       0x9c
 #define QUERY_DEV_CAP_FW_REASSIGN_MAC          0x9d
 #define QUERY_DEV_CAP_VXLAN                    0x9e
 #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET         0xb0
@@ -935,6 +936,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
                dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP;
        if (field32 & (1 << 7))
                dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT;
+       MLX4_GET(field32, outbox, QUERY_DEV_CAP_DIAG_RPRT_PER_PORT);
+       if (field32 & (1 << 17))
+               dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT;
        MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC);
        if (field & 1<<6)
                dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN;
@@ -2457,6 +2461,42 @@ int mlx4_NOP(struct mlx4_dev *dev)
                        MLX4_CMD_NATIVE);
 }
 
+/* Run the DIAG_RPRT firmware command and pick counters out of its output.
+ *
+ * @op_modifier and @port are handed to mlx4_cmd_box() unchanged.  For each
+ * i < @array_len, the value found at byte @offset[i] of the output mailbox
+ * is stored in @value[i].
+ *
+ * Returns 0 on success, the error from mlx4_alloc_cmd_mailbox() or
+ * mlx4_cmd_box(), or -EINVAL when an offset would read past the end of
+ * the mailbox; entries before the offending one have then already been
+ * written to @value.
+ */
+int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier,
+                            const u32 offset[],
+                            u32 value[], size_t array_len, u8 port)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       u32 *outbox;
+       size_t i;
+       int ret;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+
+       outbox = mailbox->buf;
+
+       ret = mlx4_cmd_box(dev, 0, mailbox->dma, port, op_modifier,
+                          MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A,
+                          MLX4_CMD_NATIVE);
+       if (ret)
+               goto out;
+
+       for (i = 0; i < array_len; i++) {
+               /* MLX4_GET() reads sizeof(value[i]) bytes starting at
+                * offset[i]; the whole read has to end inside the mailbox,
+                * not merely start inside it.
+                */
+               if (offset[i] > MLX4_MAILBOX_SIZE - sizeof(value[i])) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               MLX4_GET(value[i], outbox, offset[i]);
+       }
+
+out:
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return ret;
+}
+EXPORT_SYMBOL(mlx4_query_diag_counters);
+
 int mlx4_get_phys_port_id(struct mlx4_dev *dev)
 {
        u8 port;
index 04bc522..c07f4d0 100644 (file)
@@ -63,12 +63,12 @@ void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
                complete(&srq->free);
 }
 
-static int get_pas_size(void *srqc)
+static int get_pas_size(struct mlx5_srq_attr *in)
 {
-       u32 log_page_size = MLX5_GET(srqc, srqc, log_page_size) + 12;
-       u32 log_srq_size  = MLX5_GET(srqc, srqc, log_srq_size);
-       u32 log_rq_stride = MLX5_GET(srqc, srqc, log_rq_stride);
-       u32 page_offset   = MLX5_GET(srqc, srqc, page_offset);
+       u32 log_page_size = in->log_page_size + 12;
+       u32 log_srq_size  = in->log_size;
+       u32 log_rq_stride = in->wqe_shift;
+       u32 page_offset   = in->page_offset;
        u32 po_quanta     = 1 << (log_page_size - 6);
        u32 rq_sz         = 1 << (log_srq_size + 4 + log_rq_stride);
        u32 page_size     = 1 << log_page_size;
@@ -78,57 +78,58 @@ static int get_pas_size(void *srqc)
        return rq_num_pas * sizeof(u64);
 }
 
-static void rmpc_srqc_reformat(void *srqc, void *rmpc, bool srqc_to_rmpc)
+static void set_wq(void *wq, struct mlx5_srq_attr *in)
 {
-       void *wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
-
-       if (srqc_to_rmpc) {
-               switch (MLX5_GET(srqc, srqc, state)) {
-               case MLX5_SRQC_STATE_GOOD:
-                       MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
-                       break;
-               case MLX5_SRQC_STATE_ERROR:
-                       MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_ERR);
-                       break;
-               default:
-                       pr_warn("%s: %d: Unknown srq state = 0x%x\n", __func__,
-                               __LINE__, MLX5_GET(srqc, srqc, state));
-                       MLX5_SET(rmpc, rmpc, state, MLX5_GET(srqc, srqc, state));
-               }
-
-               MLX5_SET(wq,   wq, wq_signature,  MLX5_GET(srqc,  srqc, wq_signature));
-               MLX5_SET(wq,   wq, log_wq_pg_sz,  MLX5_GET(srqc,  srqc, log_page_size));
-               MLX5_SET(wq,   wq, log_wq_stride, MLX5_GET(srqc,  srqc, log_rq_stride) + 4);
-               MLX5_SET(wq,   wq, log_wq_sz,     MLX5_GET(srqc,  srqc, log_srq_size));
-               MLX5_SET(wq,   wq, page_offset,   MLX5_GET(srqc,  srqc, page_offset));
-               MLX5_SET(wq,   wq, lwm,           MLX5_GET(srqc,  srqc, lwm));
-               MLX5_SET(wq,   wq, pd,            MLX5_GET(srqc,  srqc, pd));
-               MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(srqc,     srqc, dbr_addr));
-       } else {
-               switch (MLX5_GET(rmpc, rmpc, state)) {
-               case MLX5_RMPC_STATE_RDY:
-                       MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_GOOD);
-                       break;
-               case MLX5_RMPC_STATE_ERR:
-                       MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_ERROR);
-                       break;
-               default:
-                       pr_warn("%s: %d: Unknown rmp state = 0x%x\n",
-                               __func__, __LINE__,
-                               MLX5_GET(rmpc, rmpc, state));
-                       MLX5_SET(srqc, srqc, state,
-                                MLX5_GET(rmpc, rmpc, state));
-               }
-
-               MLX5_SET(srqc,   srqc, wq_signature,   MLX5_GET(wq,   wq, wq_signature));
-               MLX5_SET(srqc,   srqc, log_page_size,  MLX5_GET(wq,   wq, log_wq_pg_sz));
-               MLX5_SET(srqc,   srqc, log_rq_stride,  MLX5_GET(wq,   wq, log_wq_stride) - 4);
-               MLX5_SET(srqc,   srqc, log_srq_size,   MLX5_GET(wq,   wq, log_wq_sz));
-               MLX5_SET(srqc,   srqc, page_offset,    MLX5_GET(wq,   wq, page_offset));
-               MLX5_SET(srqc,   srqc, lwm,            MLX5_GET(wq,   wq, lwm));
-               MLX5_SET(srqc,   srqc, pd,             MLX5_GET(wq,   wq, pd));
-               MLX5_SET64(srqc, srqc, dbr_addr,       MLX5_GET64(wq, wq, dbr_addr));
-       }
+       MLX5_SET(wq,   wq, wq_signature,  !!(in->flags
+                & MLX5_SRQ_FLAG_WQ_SIG));
+       MLX5_SET(wq,   wq, log_wq_pg_sz,  in->log_page_size);
+       MLX5_SET(wq,   wq, log_wq_stride, in->wqe_shift + 4);
+       MLX5_SET(wq,   wq, log_wq_sz,     in->log_size);
+       MLX5_SET(wq,   wq, page_offset,   in->page_offset);
+       MLX5_SET(wq,   wq, lwm,           in->lwm);
+       MLX5_SET(wq,   wq, pd,            in->pd);
+       MLX5_SET64(wq, wq, dbr_addr,      in->db_record);
+}
+
+/* Fill an SRQ context (srqc layout) from the attributes in @in. */
+static void set_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+       int wq_sig = !!(in->flags & MLX5_SRQ_FLAG_WQ_SIG);
+
+       MLX5_SET(srqc, srqc, wq_signature, wq_sig);
+
+       /* queue geometry */
+       MLX5_SET(srqc, srqc, log_page_size, in->log_page_size);
+       MLX5_SET(srqc, srqc, log_rq_stride, in->wqe_shift);
+       MLX5_SET(srqc, srqc, log_srq_size, in->log_size);
+       MLX5_SET(srqc, srqc, page_offset, in->page_offset);
+       MLX5_SET(srqc, srqc, lwm, in->lwm);
+
+       /* owning objects and doorbell record */
+       MLX5_SET(srqc, srqc, pd, in->pd);
+       MLX5_SET64(srqc, srqc, dbr_addr, in->db_record);
+       MLX5_SET(srqc, srqc, xrcd, in->xrcd);
+       MLX5_SET(srqc, srqc, cqn, in->cqn);
+}
+
+/* Fill @in from a wq context.  MLX5_SRQ_FLAG_WQ_SIG is added to in->flags
+ * when the context has wq_signature set; no other flag bit is touched.
+ * log_wq_stride is stored 4 larger than wqe_shift (see set_wq()), hence
+ * the "- 4" on the way back.
+ */
+static void get_wq(void *wq, struct mlx5_srq_attr *in)
+{
+       /* must be |=: with &= the flag could never become set and every
+        * other bit in in->flags would be cleared
+        */
+       if (MLX5_GET(wq, wq, wq_signature))
+               in->flags |= MLX5_SRQ_FLAG_WQ_SIG;
+       in->log_page_size = MLX5_GET(wq,   wq, log_wq_pg_sz);
+       in->wqe_shift     = MLX5_GET(wq,   wq, log_wq_stride) - 4;
+       in->log_size      = MLX5_GET(wq,   wq, log_wq_sz);
+       in->page_offset   = MLX5_GET(wq,   wq, page_offset);
+       in->lwm           = MLX5_GET(wq,   wq, lwm);
+       in->pd            = MLX5_GET(wq,   wq, pd);
+       in->db_record     = MLX5_GET64(wq, wq, dbr_addr);
+}
+
+/* Fill @in from an SRQ context (srqc layout).  MLX5_SRQ_FLAG_WQ_SIG is
+ * added to in->flags when the context has wq_signature set; no other flag
+ * bit is touched.
+ */
+static void get_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+       /* must be |=: with &= the flag could never become set and every
+        * other bit in in->flags would be cleared
+        */
+       if (MLX5_GET(srqc, srqc, wq_signature))
+               in->flags |= MLX5_SRQ_FLAG_WQ_SIG;
+       in->log_page_size = MLX5_GET(srqc,   srqc, log_page_size);
+       in->wqe_shift     = MLX5_GET(srqc,   srqc, log_rq_stride);
+       in->log_size      = MLX5_GET(srqc,   srqc, log_srq_size);
+       in->page_offset   = MLX5_GET(srqc,   srqc, page_offset);
+       in->lwm           = MLX5_GET(srqc,   srqc, lwm);
+       in->pd            = MLX5_GET(srqc,   srqc, pd);
+       in->db_record     = MLX5_GET64(srqc, srqc, dbr_addr);
 }
 
 struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
@@ -149,19 +150,36 @@ struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
 EXPORT_SYMBOL(mlx5_core_get_srq);
 
 static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                         struct mlx5_create_srq_mbox_in *in, int inlen)
+                         struct mlx5_srq_attr *in)
 {
-       struct mlx5_create_srq_mbox_out out;
+       u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0};
+       void *create_in;
+       void *srqc;
+       void *pas;
+       int pas_size;
+       int inlen;
        int err;
 
-       memset(&out, 0, sizeof(out));
+       pas_size  = get_pas_size(in);
+       inlen     = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size;
+       create_in = mlx5_vzalloc(inlen);
+       if (!create_in)
+               return -ENOMEM;
+
+       srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry);
+       pas = MLX5_ADDR_OF(create_srq_in, create_in, pas);
 
-       in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_SRQ);
+       set_srqc(srqc, in);
+       memcpy(pas, in->pas, pas_size);
 
-       err = mlx5_cmd_exec_check_status(dev, (u32 *)in, inlen, (u32 *)(&out),
-                                        sizeof(out));
+       MLX5_SET(create_srq_in, create_in, opcode,
+                MLX5_CMD_OP_CREATE_SRQ);
 
-       srq->srqn = be32_to_cpu(out.srqn) & 0xffffff;
+       err = mlx5_cmd_exec_check_status(dev, create_in, inlen, create_out,
+                                        sizeof(create_out));
+       kvfree(create_in);
+       if (!err)
+               srq->srqn = MLX5_GET(create_srq_out, create_out, srqn);
 
        return err;
 }
@@ -169,67 +187,75 @@ static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 static int destroy_srq_cmd(struct mlx5_core_dev *dev,
                           struct mlx5_core_srq *srq)
 {
-       struct mlx5_destroy_srq_mbox_in in;
-       struct mlx5_destroy_srq_mbox_out out;
+       u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0};
+       u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0};
 
-       memset(&in, 0, sizeof(in));
-       memset(&out, 0, sizeof(out));
-       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
-       in.srqn = cpu_to_be32(srq->srqn);
+       MLX5_SET(destroy_srq_in, srq_in, opcode,
+                MLX5_CMD_OP_DESTROY_SRQ);
+       MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn);
 
-       return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in),
-                                         (u32 *)(&out), sizeof(out));
+       return mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in),
+                                         srq_out, sizeof(srq_out));
 }
 
 static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
                       u16 lwm, int is_srq)
 {
-       struct mlx5_arm_srq_mbox_in     in;
-       struct mlx5_arm_srq_mbox_out    out;
-
-       memset(&in, 0, sizeof(in));
-       memset(&out, 0, sizeof(out));
+       /* arm_srq structs missing using identical xrc ones */
+       u32 srq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0};
+       u32 srq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0};
 
-       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ);
-       in.hdr.opmod = cpu_to_be16(!!is_srq);
-       in.srqn = cpu_to_be32(srq->srqn);
-       in.lwm = cpu_to_be16(lwm);
+       MLX5_SET(arm_xrc_srq_in, srq_in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
+       MLX5_SET(arm_xrc_srq_in, srq_in, xrc_srqn, srq->srqn);
+       MLX5_SET(arm_xrc_srq_in, srq_in, lwm,      lwm);
 
-       return mlx5_cmd_exec_check_status(dev, (u32 *)(&in),
-                                         sizeof(in), (u32 *)(&out),
-                                         sizeof(out));
+       return  mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in),
+                                          srq_out, sizeof(srq_out));
 }
 
 static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_query_srq_mbox_out *out)
+                        struct mlx5_srq_attr *out)
 {
-       struct mlx5_query_srq_mbox_in in;
+       u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0};
+       u32 *srq_out;
+       void *srqc;
+       int err;
 
-       memset(&in, 0, sizeof(in));
+       srq_out = mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_srq_out));
+       if (!srq_out)
+               return -ENOMEM;
 
-       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ);
-       in.srqn = cpu_to_be32(srq->srqn);
+       MLX5_SET(query_srq_in, srq_in, opcode,
+                MLX5_CMD_OP_QUERY_SRQ);
+       MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn);
+       err =  mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in),
+                                         srq_out,
+                                         MLX5_ST_SZ_BYTES(query_srq_out));
+       if (err)
+               goto out;
 
-       return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in),
-                                         (u32 *)out, sizeof(*out));
+       srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry);
+       get_srqc(srqc, out);
+       if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD)
+               out->flags |= MLX5_SRQ_FLAG_ERR;
+out:
+       kvfree(srq_out);
+       return err;
 }
 
 static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
                              struct mlx5_core_srq *srq,
-                             struct mlx5_create_srq_mbox_in *in,
-                             int srq_inlen)
+                             struct mlx5_srq_attr *in)
 {
        u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)];
        void *create_in;
-       void *srqc;
        void *xrc_srqc;
        void *pas;
        int pas_size;
        int inlen;
        int err;
 
-       srqc      = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry);
-       pas_size  = get_pas_size(srqc);
+       pas_size  = get_pas_size(in);
        inlen     = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size;
        create_in = mlx5_vzalloc(inlen);
        if (!create_in)
@@ -239,7 +265,8 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
                                xrc_srq_context_entry);
        pas      = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas);
 
-       memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc));
+       set_srqc(xrc_srqc, in);
+       MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index);
        memcpy(pas, in->pas, pas_size);
        MLX5_SET(create_xrc_srq_in, create_in, opcode,
                 MLX5_CMD_OP_CREATE_XRC_SRQ);
@@ -293,11 +320,10 @@ static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev,
 
 static int query_xrc_srq_cmd(struct mlx5_core_dev *dev,
                             struct mlx5_core_srq *srq,
-                            struct mlx5_query_srq_mbox_out *out)
+                            struct mlx5_srq_attr *out)
 {
        u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
        u32 *xrcsrq_out;
-       void *srqc;
        void *xrc_srqc;
        int err;
 
@@ -317,8 +343,9 @@ static int query_xrc_srq_cmd(struct mlx5_core_dev *dev,
 
        xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out,
                                xrc_srq_context_entry);
-       srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry);
-       memcpy(srqc, xrc_srqc, MLX5_ST_SZ_BYTES(srqc));
+       get_srqc(xrc_srqc, out);
+       if (MLX5_GET(xrc_srqc, xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD)
+               out->flags |= MLX5_SRQ_FLAG_ERR;
 
 out:
        kvfree(xrcsrq_out);
@@ -326,26 +353,27 @@ out:
 }
 
 static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                         struct mlx5_create_srq_mbox_in *in, int srq_inlen)
+                         struct mlx5_srq_attr *in)
 {
        void *create_in;
        void *rmpc;
-       void *srqc;
+       void *wq;
        int pas_size;
        int inlen;
        int err;
 
-       srqc = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry);
-       pas_size = get_pas_size(srqc);
+       pas_size = get_pas_size(in);
        inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size;
        create_in = mlx5_vzalloc(inlen);
        if (!create_in)
                return -ENOMEM;
 
        rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx);
+       wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
 
+       MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+       set_wq(wq, in);
        memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
-       rmpc_srqc_reformat(srqc, rmpc, true);
 
        err = mlx5_core_create_rmp(dev, create_in, inlen, &srq->srqn);
 
@@ -390,11 +418,10 @@ static int arm_rmp_cmd(struct mlx5_core_dev *dev,
 }
 
 static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_query_srq_mbox_out *out)
+                        struct mlx5_srq_attr *out)
 {
        u32 *rmp_out;
        void *rmpc;
-       void *srqc;
        int err;
 
        rmp_out =  mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_rmp_out));
@@ -405,9 +432,10 @@ static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
        if (err)
                goto out;
 
-       srqc = MLX5_ADDR_OF(query_srq_out, out,     srq_context_entry);
        rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context);
-       rmpc_srqc_reformat(srqc, rmpc, false);
+       get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out);
+       if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY)
+               out->flags |= MLX5_SRQ_FLAG_ERR;
 
 out:
        kvfree(rmp_out);
@@ -416,15 +444,14 @@ out:
 
 static int create_srq_split(struct mlx5_core_dev *dev,
                            struct mlx5_core_srq *srq,
-                           struct mlx5_create_srq_mbox_in *in,
-                           int inlen, int is_xrc)
+                           struct mlx5_srq_attr *in)
 {
        if (!dev->issi)
-               return create_srq_cmd(dev, srq, in, inlen);
+               return create_srq_cmd(dev, srq, in);
        else if (srq->common.res == MLX5_RES_XSRQ)
-               return create_xrc_srq_cmd(dev, srq, in, inlen);
+               return create_xrc_srq_cmd(dev, srq, in);
        else
-               return create_rmp_cmd(dev, srq, in, inlen);
+               return create_rmp_cmd(dev, srq, in);
 }
 
 static int destroy_srq_split(struct mlx5_core_dev *dev,
@@ -439,15 +466,17 @@ static int destroy_srq_split(struct mlx5_core_dev *dev,
 }
 
 int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_create_srq_mbox_in *in, int inlen,
-                        int is_xrc)
+                        struct mlx5_srq_attr *in)
 {
        int err;
        struct mlx5_srq_table *table = &dev->priv.srq_table;
 
-       srq->common.res = is_xrc ? MLX5_RES_XSRQ : MLX5_RES_SRQ;
+       if (in->type == IB_SRQT_XRC)
+               srq->common.res = MLX5_RES_XSRQ;
+       else
+               srq->common.res = MLX5_RES_SRQ;
 
-       err = create_srq_split(dev, srq, in, inlen, is_xrc);
+       err = create_srq_split(dev, srq, in);
        if (err)
                return err;
 
@@ -502,7 +531,7 @@ int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
 EXPORT_SYMBOL(mlx5_core_destroy_srq);
 
 int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                       struct mlx5_query_srq_mbox_out *out)
+                       struct mlx5_srq_attr *out)
 {
        if (!dev->issi)
                return query_srq_cmd(dev, srq, out);
index 03a5093..28274a6 100644 (file)
@@ -85,6 +85,7 @@ int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn)
 
        return err;
 }
+EXPORT_SYMBOL(mlx5_core_create_rq);
 
 int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen)
 {
@@ -110,6 +111,7 @@ void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
 
        mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_core_destroy_rq);
 
 int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out)
 {
@@ -430,6 +432,7 @@ int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
 
        return err;
 }
+EXPORT_SYMBOL(mlx5_core_create_rqt);
 
 int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
                         int inlen)
@@ -455,3 +458,4 @@ void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn)
 
        mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_core_destroy_rqt);
index e6f6910..42da355 100644 (file)
@@ -220,6 +220,7 @@ enum {
        MLX4_DEV_CAP_FLAG2_LB_SRC_CHK           = 1ULL << 32,
        MLX4_DEV_CAP_FLAG2_ROCE_V1_V2           = 1ULL <<  33,
        MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER   = 1ULL <<  34,
+       MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT        = 1ULL <<  35,
 };
 
 enum {
@@ -1342,6 +1343,9 @@ enum {
        VXLAN_STEER_BY_INNER_VLAN       = 1 << 4,
 };
 
+enum {
+       MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS = 0x2,
+};
 
 int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn,
                                enum mlx4_net_trans_promisc_mode mode);
@@ -1382,6 +1386,9 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
 int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
 int mlx4_SYNC_TPT(struct mlx4_dev *dev);
 int mlx4_test_interrupts(struct mlx4_dev *dev);
+int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier,
+                            const u32 offset[], u32 value[],
+                            size_t array_len, u8 port);
 u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port);
 bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector);
 struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port);
index 2be976d..2566f6d 100644 (file)
@@ -58,6 +58,8 @@ struct mlx5_core_cq {
                void (*comp)(struct mlx5_core_cq *);
                void            *priv;
        } tasklet_ctx;
+       int                     reset_notify_added;
+       struct list_head        reset_notify;
 };
 
 
index a041b99..ccea6fb 100644 (file)
@@ -46,6 +46,7 @@
 
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
+#include <linux/mlx5/srq.h>
 
 enum {
        MLX5_RQ_BITMASK_VSD = 1 << 1,
@@ -798,11 +799,10 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
                                 struct mlx5_cmd_mailbox *head);
 int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_create_srq_mbox_in *in, int inlen,
-                        int is_xrc);
+                        struct mlx5_srq_attr *in);
 int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq);
 int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                       struct mlx5_query_srq_mbox_out *out);
+                       struct mlx5_srq_attr *out);
 int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
                      u16 lwm, int is_srq);
 void mlx5_init_mkey_table(struct mlx5_core_dev *dev);
index ab31081..7879bf4 100644 (file)
@@ -556,9 +556,9 @@ struct mlx5_destroy_qp_mbox_out {
 struct mlx5_modify_qp_mbox_in {
        struct mlx5_inbox_hdr   hdr;
        __be32                  qpn;
-       u8                      rsvd1[4];
-       __be32                  optparam;
        u8                      rsvd0[4];
+       __be32                  optparam;
+       u8                      rsvd1[4];
        struct mlx5_qp_context  ctx;
        u8                      rsvd2[16];
 };
index f43ed05..33c97dc 100644 (file)
 
 #include <linux/mlx5/driver.h>
 
+enum { /* bits OR-ed into the flags member of struct mlx5_srq_attr */
+       MLX5_SRQ_FLAG_ERR    = (1 << 0), /* set by the SRQ, XRC SRQ and RMP query commands when the queried context state is not GOOD/RDY */
+       MLX5_SRQ_FLAG_WQ_SIG = (1 << 1), /* NOTE(review): no user visible in this diff; presumably enables WQ signature - confirm */
+};
+
+struct mlx5_srq_attr { /* SRQ attributes: input of mlx5_core_create_srq(), output of mlx5_core_query_srq() */
+       u32 type; /* mlx5_core_create_srq() compares this with IB_SRQT_XRC to pick MLX5_RES_XSRQ vs MLX5_RES_SRQ */
+       u32 flags; /* MLX5_SRQ_FLAG_* bits; the query commands OR in MLX5_SRQ_FLAG_ERR */
+       u32 log_size;
+       u32 wqe_shift;
+       u32 log_page_size;
+       u32 wqe_cnt;
+       u32 srqn;
+       u32 xrcd;
+       u32 page_offset;
+       u32 cqn;
+       u32 pd;
+       u32 lwm;
+       u32 user_index; /* written into the XRC SRQ context by create_xrc_srq_cmd() */
+       u64 db_record;
+       u64 *pas; /* memcpy'd into the create command; byte count comes from get_pas_size() */
+};
+
+struct mlx5_core_dev;
+
 void mlx5_init_srq_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev);
 
index 3840416..5ee7aab 100644 (file)
@@ -94,6 +94,19 @@ enum ib_sa_selector {
        IB_SA_BEST = 3
 };
 
+/*
+ * There are 4 types of join states:
+ * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
+ * The order corresponds to JoinState bits in MCMemberRecord.
+ */
+enum ib_sa_mc_join_states {
+       FULLMEMBER_JOIN,
+       NONMEMBER_JOIN,
+       SENDONLY_NONMEBER_JOIN, /* sic: spelled "NONMEBER"; this is the SendOnlyNonMember state */
+       SENDONLY_FULLMEMBER_JOIN,
+       NUM_JOIN_MEMBERSHIP_TYPES, /* not a join state: equals the number of states listed above (4) */
+};
+
 #define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT      BIT(12)
 
 /*
index a8137dc..94a0bc5 100644 (file)
@@ -562,6 +562,7 @@ enum ib_event_type {
        IB_EVENT_QP_LAST_WQE_REACHED,
        IB_EVENT_CLIENT_REREGISTER,
        IB_EVENT_GID_CHANGE,
+       IB_EVENT_WQ_FATAL,
 };
 
 const char *__attribute_const__ ib_event_msg(enum ib_event_type event);
@@ -572,6 +573,7 @@ struct ib_event {
                struct ib_cq    *cq;
                struct ib_qp    *qp;
                struct ib_srq   *srq;
+               struct ib_wq    *wq;
                u8              port_num;
        } element;
        enum ib_event_type      event;
@@ -1015,6 +1017,7 @@ struct ib_qp_init_attr {
         * Only needed for special QP types, or when using the RW API.
         */
        u8                      port_num;
+       struct ib_rwq_ind_table *rwq_ind_tbl;
 };
 
 struct ib_qp_open_attr {
@@ -1323,6 +1326,8 @@ struct ib_ucontext {
        struct list_head        ah_list;
        struct list_head        xrcd_list;
        struct list_head        rule_list;
+       struct list_head        wq_list;
+       struct list_head        rwq_ind_tbl_list;
        int                     closing;
 
        struct pid             *tgid;
@@ -1428,6 +1433,63 @@ struct ib_srq {
        } ext;
 };
 
+enum ib_wq_type {
+       IB_WQT_RQ
+};
+
+enum ib_wq_state {
+       IB_WQS_RESET,
+       IB_WQS_RDY,
+       IB_WQS_ERR
+};
+
+struct ib_wq {
+       struct ib_device       *device;
+       struct ib_uobject      *uobject;
+       void                *wq_context;
+       void                (*event_handler)(struct ib_event *, void *);
+       struct ib_pd           *pd;
+       struct ib_cq           *cq;
+       u32             wq_num;
+       enum ib_wq_state       state;
+       enum ib_wq_type wq_type;
+       atomic_t                usecnt;
+};
+
+struct ib_wq_init_attr {
+       void                   *wq_context;
+       enum ib_wq_type wq_type;
+       u32             max_wr;
+       u32             max_sge;
+       struct  ib_cq          *cq;
+       void                (*event_handler)(struct ib_event *, void *);
+};
+
+enum ib_wq_attr_mask { /* NOTE(review): presumably the bits accepted in the wq_attr_mask argument of ib_modify_wq() - confirm */
+       IB_WQ_STATE     = 1 << 0, /* presumably marks ib_wq_attr.wq_state as valid - confirm */
+       IB_WQ_CUR_STATE = 1 << 1, /* presumably marks ib_wq_attr.curr_wq_state as valid - confirm */
+};
+
+struct ib_wq_attr {
+       enum    ib_wq_state     wq_state;
+       enum    ib_wq_state     curr_wq_state;
+};
+
+struct ib_rwq_ind_table {
+       struct ib_device        *device;
+       struct ib_uobject      *uobject;
+       atomic_t                usecnt;
+       u32             ind_tbl_num;
+       u32             log_ind_tbl_size;
+       struct ib_wq    **ind_tbl;
+};
+
+struct ib_rwq_ind_table_init_attr {
+       u32             log_ind_tbl_size;
+       /* Each entry is a pointer to Receive Work Queue */
+       struct ib_wq    **ind_tbl;
+};
+
 struct ib_qp {
        struct ib_device       *device;
        struct ib_pd           *pd;
@@ -1450,6 +1512,7 @@ struct ib_qp {
        void                   *qp_context;
        u32                     qp_num;
        enum ib_qp_type         qp_type;
+       struct ib_rwq_ind_table *rwq_ind_tbl;
 };
 
 struct ib_mr {
@@ -1506,6 +1569,7 @@ enum ib_flow_spec_type {
        IB_FLOW_SPEC_IB         = 0x22,
        /* L3 header*/
        IB_FLOW_SPEC_IPV4       = 0x30,
+       IB_FLOW_SPEC_IPV6       = 0x31,
        /* L4 headers*/
        IB_FLOW_SPEC_TCP        = 0x40,
        IB_FLOW_SPEC_UDP        = 0x41
@@ -1567,6 +1631,18 @@ struct ib_flow_spec_ipv4 {
        struct ib_flow_ipv4_filter mask;
 };
 
+struct ib_flow_ipv6_filter {
+       u8      src_ip[16];
+       u8      dst_ip[16];
+};
+
+struct ib_flow_spec_ipv6 {
+       enum ib_flow_spec_type     type;
+       u16                        size;
+       struct ib_flow_ipv6_filter val;
+       struct ib_flow_ipv6_filter mask;
+};
+
 struct ib_flow_tcp_udp_filter {
        __be16  dst_port;
        __be16  src_port;
@@ -1588,6 +1664,7 @@ union ib_flow_spec {
        struct ib_flow_spec_ib          ib;
        struct ib_flow_spec_ipv4        ipv4;
        struct ib_flow_spec_tcp_udp     tcp_udp;
+       struct ib_flow_spec_ipv6        ipv6;
 };
 
 struct ib_flow_attr {
@@ -1921,7 +1998,18 @@ struct ib_device {
                                                   struct ifla_vf_stats *stats);
        int                        (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid,
                                                  int type);
-
+       struct ib_wq *             (*create_wq)(struct ib_pd *pd,
+                                               struct ib_wq_init_attr *init_attr,
+                                               struct ib_udata *udata);
+       int                        (*destroy_wq)(struct ib_wq *wq);
+       int                        (*modify_wq)(struct ib_wq *wq,
+                                               struct ib_wq_attr *attr,
+                                               u32 wq_attr_mask,
+                                               struct ib_udata *udata);
+       struct ib_rwq_ind_table *  (*create_rwq_ind_table)(struct ib_device *device,
+                                                          struct ib_rwq_ind_table_init_attr *init_attr,
+                                                          struct ib_udata *udata);
+       int                        (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table);
        struct ib_dma_mapping_ops   *dma_ops;
 
        struct module               *owner;
@@ -1956,6 +2044,7 @@ struct ib_device {
         * in fast paths.
         */
        int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *);
+       void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len);
 };
 
 struct ib_client {
@@ -1991,6 +2080,8 @@ struct ib_client {
 struct ib_device *ib_alloc_device(size_t size);
 void ib_dealloc_device(struct ib_device *device);
 
+void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len);
+
 int ib_register_device(struct ib_device *device,
                       int (*port_callback)(struct ib_device *,
                                            u8, struct kobject *));
@@ -3168,6 +3259,15 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
                                            u16 pkey, const union ib_gid *gid,
                                            const struct sockaddr *addr);
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+                          struct ib_wq_init_attr *init_attr);
+int ib_destroy_wq(struct ib_wq *wq);
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr,
+                u32 wq_attr_mask);
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+                                                struct ib_rwq_ind_table_init_attr*
+                                                wq_ind_table_init_attr);
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 
 int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
                 unsigned int *sg_offset, unsigned int page_size);
index afe44fd..81fb1d1 100644 (file)
@@ -333,11 +333,13 @@ int rdma_disconnect(struct rdma_cm_id *id);
  *   address.
  * @id: Communication identifier associated with the request.
  * @addr: Multicast address identifying the group to join.
+ * @join_state: Multicast JoinState bitmap requested by port.
+ *             Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits.
  * @context: User-defined context associated with the join request, returned
  * to the user through the private_data pointer in multicast events.
  */
 int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
-                       void *context);
+                       u8 join_state, void *context);
 
 /**
  * rdma_leave_multicast - Leave the multicast group specified by the given
index 231901b..4edb0f2 100644 (file)
@@ -6,3 +6,4 @@ header-y += ib_user_verbs.h
 header-y += rdma_netlink.h
 header-y += rdma_user_cm.h
 header-y += hfi/
+header-y += rdma_user_rxe.h
index b6543d7..7f035f4 100644 (file)
@@ -95,6 +95,11 @@ enum {
        IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP,
        IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
        IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
+       IB_USER_VERBS_EX_CMD_CREATE_WQ,
+       IB_USER_VERBS_EX_CMD_MODIFY_WQ,
+       IB_USER_VERBS_EX_CMD_DESTROY_WQ,
+       IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL,
+       IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL
 };
 
 /*
@@ -518,6 +523,14 @@ struct ib_uverbs_create_qp {
        __u64 driver_data[0];
 };
 
+enum ib_uverbs_create_qp_mask {
+       IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0,
+};
+
+enum {
+       IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE,
+};
+
 struct ib_uverbs_ex_create_qp {
        __u64 user_handle;
        __u32 pd_handle;
@@ -535,6 +548,8 @@ struct ib_uverbs_ex_create_qp {
        __u8 reserved;
        __u32 comp_mask;
        __u32 create_flags;
+       __u32 rwq_ind_tbl_handle;
+       __u32  reserved1;
 };
 
 struct ib_uverbs_open_qp {
@@ -852,6 +867,24 @@ struct ib_uverbs_flow_spec_tcp_udp {
        struct ib_uverbs_flow_tcp_udp_filter mask;
 };
 
+struct ib_uverbs_flow_ipv6_filter {
+       __u8 src_ip[16];
+       __u8 dst_ip[16];
+};
+
+struct ib_uverbs_flow_spec_ipv6 {
+       union {
+               struct ib_uverbs_flow_spec_hdr hdr;
+               struct {
+                       __u32 type;
+                       __u16 size;
+                       __u16 reserved;
+               };
+       };
+       struct ib_uverbs_flow_ipv6_filter val;
+       struct ib_uverbs_flow_ipv6_filter mask;
+};
+
 struct ib_uverbs_flow_attr {
        __u32 type;
        __u16 size;
@@ -946,4 +979,66 @@ struct ib_uverbs_destroy_srq_resp {
        __u32 events_reported;
 };
 
+struct ib_uverbs_ex_create_wq  {
+       __u32 comp_mask;
+       __u32 wq_type;
+       __u64 user_handle;
+       __u32 pd_handle;
+       __u32 cq_handle;
+       __u32 max_wr;
+       __u32 max_sge;
+};
+
+struct ib_uverbs_ex_create_wq_resp {
+       __u32 comp_mask;
+       __u32 response_length;
+       __u32 wq_handle;
+       __u32 max_wr;
+       __u32 max_sge;
+       __u32 wqn;
+};
+
+struct ib_uverbs_ex_destroy_wq  {
+       __u32 comp_mask;
+       __u32 wq_handle;
+};
+
+struct ib_uverbs_ex_destroy_wq_resp {
+       __u32 comp_mask;
+       __u32 response_length;
+       __u32 events_reported;
+       __u32 reserved;
+};
+
+struct ib_uverbs_ex_modify_wq  {
+       __u32 attr_mask;
+       __u32 wq_handle;
+       __u32 wq_state;
+       __u32 curr_wq_state;
+};
+
+/* Upper bound chosen to prevent excessive memory allocation; it is not the maximum expected table size */
+#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d
+struct ib_uverbs_ex_create_rwq_ind_table  {
+       __u32 comp_mask;
+       __u32 log_ind_tbl_size; /* NOTE(review): presumably bounded by IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE - confirm in the handler */
+       /* The WQ handles follow this header; their count is determined by
+        * log_ind_tbl_size (presumably 1 << log_ind_tbl_size - TODO confirm):
+        * wq_handle1, wq_handle2, ...
+        */
+       __u32 wq_handles[0];
+};
+
+struct ib_uverbs_ex_create_rwq_ind_table_resp {
+       __u32 comp_mask;
+       __u32 response_length;
+       __u32 ind_tbl_handle;
+       __u32 ind_tbl_num;
+};
+
+struct ib_uverbs_ex_destroy_rwq_ind_table  {
+       __u32 comp_mask;
+       __u32 ind_tbl_handle;
+};
+
 #endif /* IB_USER_VERBS_H */
index 3066718..01923d4 100644 (file)
@@ -244,12 +244,19 @@ struct rdma_ucm_join_ip_mcast {
        __u32 id;
 };
 
+/* Multicast join flags */
+enum {
+       RDMA_MC_JOIN_FLAG_FULLMEMBER,
+       RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER,
+       RDMA_MC_JOIN_FLAG_RESERVED,
+};
+
 struct rdma_ucm_join_mcast {
        __u64 response;         /* rdma_ucma_create_id_resp */
        __u64 uid;
        __u32 id;
        __u16 addr_size;
-       __u16 reserved;
+       __u16 join_flags;
        struct sockaddr_storage addr;
 };
 
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h
new file mode 100644 (file)
index 0000000..1de99cf
--- /dev/null
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_USER_RXE_H
+#define RDMA_USER_RXE_H
+
+#include <linux/types.h>
+
+union rxe_gid {
+       __u8    raw[16];
+       struct {
+               __be64  subnet_prefix;
+               __be64  interface_id;
+       } global;
+};
+
+struct rxe_global_route {
+       union rxe_gid   dgid;
+       __u32           flow_label;
+       __u8            sgid_index;
+       __u8            hop_limit;
+       __u8            traffic_class;
+};
+
+struct rxe_av { /* address vector embedded in struct rxe_send_wqe */
+       __u8                    port_num;
+       __u8                    network_type;
+       struct rxe_global_route grh;
+       union { /* NOTE(review): this header includes only <linux/types.h>; confirm the sockaddr types below are declared by every includer */
+               struct sockaddr         _sockaddr;
+               struct sockaddr_in      _sockaddr_in;
+               struct sockaddr_in6     _sockaddr_in6;
+       } sgid_addr, dgid_addr;
+};
+
+struct rxe_send_wr { /* send work request; embedded as the wr member of struct rxe_send_wqe */
+       __u64                   wr_id;
+       __u32                   num_sge;
+       __u32                   opcode;
+       __u32                   send_flags;
+       union {
+               __be32          imm_data;
+               __u32           invalidate_rkey;
+       } ex;
+       union { /* NOTE(review): which member is valid presumably depends on opcode - confirm */
+               struct {
+                       __u64   remote_addr;
+                       __u32   rkey;
+               } rdma;
+               struct {
+                       __u64   remote_addr;
+                       __u64   compare_add;
+                       __u64   swap;
+                       __u32   rkey;
+               } atomic;
+               struct {
+                       __u32   remote_qpn;
+                       __u32   remote_qkey;
+                       __u16   pkey_index;
+               } ud;
+               struct {
+                       struct ib_mr *mr; /* NOTE(review): pointer in a uapi header; its size differs between 32- and 64-bit ABIs - confirm this member is kernel-only */
+                       __u32        key;
+                       int          access; /* NOTE(review): plain int rather than a fixed-width __s32/__u32 type */
+               } reg;
+       } wr;
+};
+
+struct rxe_sge {
+       __u64   addr;
+       __u32   length;
+       __u32   lkey;
+};
+
+struct mminfo {
+       __u64                   offset;
+       __u32                   size;
+       __u32                   pad;
+};
+
+struct rxe_dma_info {
+       __u32                   length;
+       __u32                   resid;
+       __u32                   cur_sge;
+       __u32                   num_sge;
+       __u32                   sge_offset;
+       union {
+               __u8            inline_data[0];
+               struct rxe_sge  sge[0];
+       };
+};
+
+struct rxe_send_wqe {
+       struct rxe_send_wr      wr;
+       struct rxe_av           av;
+       __u32                   status;
+       __u32                   state;
+       __u64                   iova;
+       __u32                   mask;
+       __u32                   first_psn;
+       __u32                   last_psn;
+       __u32                   ack_length;
+       __u32                   ssn;
+       __u32                   has_rd_atomic;
+       struct rxe_dma_info     dma;
+};
+
+struct rxe_recv_wqe {
+       __u64                   wr_id;
+       __u32                   num_sge;
+       __u32                   padding;
+       struct rxe_dma_info     dma;
+};
+
+#endif /* RDMA_USER_RXE_H */